diff --git a/.gitattributes b/.gitattributes index cf17ba9d5e..6a3ee0fe72 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,11 +1,13 @@ cuda/_version.py export-subst * text eol=lf +*.cmd text eol=crlf # we do not own any headers checked in, don't touch them *.h binary *.hpp binary # git should not convert line endings in PNG files *.png binary +*.svg binary # SCM syntax highlighting & preventing 3-way merges pixi.lock merge=binary linguist-language=YAML linguist-generated=true diff --git a/.spdx-ignore b/.spdx-ignore index 84f051fafc..c7177752e1 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -11,3 +11,5 @@ cuda_bindings/examples/* # Vendored cuda_core/cuda/core/experimental/include/dlpack.h + +qa/ctk-next.drawio.svg diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 988b820919..77bcc2d3f6 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -19,41 +19,60 @@ linux: pull-request: - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', 
LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 
'a100', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } nightly: [] special_runners: amd64: - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', DRIVER: 'latest' } windows: pull-request: - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtx2080', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtx4090', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'rtx4090', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtxpro6000', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', DRIVER: 
'latest', DRIVER_MODE: 'MCDM' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } nightly: [] diff --git a/ci/versions.yml b/ci/versions.yml index 6ce7e3a32f..0e27f7ab63 100644 --- a/ci/versions.yml +++ b/ci/versions.yml @@ -3,6 +3,6 @@ cuda: build: - version: "13.0.2" + version: "13.1.0" prev_build: version: "12.9.1" diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in index 8038f8d95c..62db8b7d54 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
from cuda.bindings.cydriver cimport * {{if 'cuGetErrorString' in found_functions}} @@ -984,11 +984,21 @@ cdef CUresult _cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUdev cdef CUresult _cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuMulticastBindMem_v2' in found_functions}} + +cdef CUresult _cuMulticastBindMem_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuMulticastBindAddr' in found_functions}} cdef CUresult _cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuMulticastBindAddr_v2' in found_functions}} + +cdef CUresult _cuMulticastBindAddr_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuMulticastUnbind' in found_functions}} cdef CUresult _cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size) except ?CUDA_ERROR_NOT_FOUND nogil @@ -2161,7 +2171,7 @@ cdef CUresult _cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resource {{if 'cuDevSmResourceSplitByCount' in found_functions}} -cdef CUresult _cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult _cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remainder, unsigned int flags, unsigned int 
minCount) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuDevResourceGenerateDesc' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index 664d322b88..9bf8931958 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. {{if 'Windows' == platform.system()}} import os cimport cuda.bindings._lib.windll as windll @@ -212,7 +212,9 @@ cdef bint __cuPythonInit = False {{if 'cuMulticastCreate' in found_functions}}cdef void *__cuMulticastCreate = NULL{{endif}} {{if 'cuMulticastAddDevice' in found_functions}}cdef void *__cuMulticastAddDevice = NULL{{endif}} {{if 'cuMulticastBindMem' in found_functions}}cdef void *__cuMulticastBindMem = NULL{{endif}} +{{if 'cuMulticastBindMem_v2' in found_functions}}cdef void *__cuMulticastBindMem_v2 = NULL{{endif}} {{if 'cuMulticastBindAddr' in found_functions}}cdef void *__cuMulticastBindAddr = NULL{{endif}} +{{if 'cuMulticastBindAddr_v2' in found_functions}}cdef void *__cuMulticastBindAddr_v2 = NULL{{endif}} {{if 'cuMulticastUnbind' in found_functions}}cdef void *__cuMulticastUnbind = NULL{{endif}} {{if 'cuMulticastGetGranularity' in found_functions}}cdef void *__cuMulticastGetGranularity = NULL{{endif}} {{if 'cuPointerGetAttribute' in found_functions}}cdef void *__cuPointerGetAttribute = NULL{{endif}} @@ -1832,10 +1834,18 @@ cdef int _cuPythonInit() except -1 nogil: global __cuMulticastBindMem _F_cuGetProcAddress_v2('cuMulticastBindMem', &__cuMulticastBindMem, 12010, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 
'cuMulticastBindMem_v2' in found_functions}} + global __cuMulticastBindMem_v2 + _F_cuGetProcAddress_v2('cuMulticastBindMem', &__cuMulticastBindMem_v2, 13010, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuMulticastBindAddr' in found_functions}} global __cuMulticastBindAddr _F_cuGetProcAddress_v2('cuMulticastBindAddr', &__cuMulticastBindAddr, 12010, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuMulticastBindAddr_v2' in found_functions}} + global __cuMulticastBindAddr_v2 + _F_cuGetProcAddress_v2('cuMulticastBindAddr', &__cuMulticastBindAddr_v2, 13010, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuMulticastUnbind' in found_functions}} global __cuMulticastUnbind _F_cuGetProcAddress_v2('cuMulticastUnbind', &__cuMulticastUnbind, 12010, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -4064,10 +4074,18 @@ cdef int _cuPythonInit() except -1 nogil: global __cuMulticastBindMem __cuMulticastBindMem = windll.GetProcAddress(handle, 'cuMulticastBindMem') {{endif}} + {{if 'cuMulticastBindMem_v2' in found_functions}} + global __cuMulticastBindMem_v2 + __cuMulticastBindMem_v2 = windll.GetProcAddress(handle, 'cuMulticastBindMem_v2') + {{endif}} {{if 'cuMulticastBindAddr' in found_functions}} global __cuMulticastBindAddr __cuMulticastBindAddr = windll.GetProcAddress(handle, 'cuMulticastBindAddr') {{endif}} + {{if 'cuMulticastBindAddr_v2' in found_functions}} + global __cuMulticastBindAddr_v2 + __cuMulticastBindAddr_v2 = windll.GetProcAddress(handle, 'cuMulticastBindAddr_v2') + {{endif}} {{if 'cuMulticastUnbind' in found_functions}} global __cuMulticastUnbind __cuMulticastUnbind = windll.GetProcAddress(handle, 'cuMulticastUnbind') @@ -6293,10 +6311,18 @@ cdef int _cuPythonInit() except -1 nogil: global __cuMulticastBindMem __cuMulticastBindMem = dlfcn.dlsym(handle, 'cuMulticastBindMem') {{endif}} + {{if 'cuMulticastBindMem_v2' in found_functions}} + global __cuMulticastBindMem_v2 + __cuMulticastBindMem_v2 = dlfcn.dlsym(handle, 'cuMulticastBindMem_v2') + {{endif}} {{if 
'cuMulticastBindAddr' in found_functions}} global __cuMulticastBindAddr __cuMulticastBindAddr = dlfcn.dlsym(handle, 'cuMulticastBindAddr') {{endif}} + {{if 'cuMulticastBindAddr_v2' in found_functions}} + global __cuMulticastBindAddr_v2 + __cuMulticastBindAddr_v2 = dlfcn.dlsym(handle, 'cuMulticastBindAddr_v2') + {{endif}} {{if 'cuMulticastUnbind' in found_functions}} global __cuMulticastUnbind __cuMulticastUnbind = dlfcn.dlsym(handle, 'cuMulticastUnbind') @@ -9584,6 +9610,18 @@ cdef CUresult _cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_t return err {{endif}} +{{if 'cuMulticastBindMem_v2' in found_functions}} + +cdef CUresult _cuMulticastBindMem_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMulticastBindMem_v2 + cuPythonInit() + if __cuMulticastBindMem_v2 == NULL: + with gil: + raise RuntimeError('Function "cuMulticastBindMem_v2" not found') + err = ( __cuMulticastBindMem_v2)(mcHandle, dev, mcOffset, memHandle, memOffset, size, flags) + return err +{{endif}} + {{if 'cuMulticastBindAddr' in found_functions}} cdef CUresult _cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -9596,6 +9634,18 @@ cdef CUresult _cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t return err {{endif}} +{{if 'cuMulticastBindAddr_v2' in found_functions}} + +cdef CUresult _cuMulticastBindAddr_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMulticastBindAddr_v2 + cuPythonInit() + if __cuMulticastBindAddr_v2 == NULL: + with gil: + raise RuntimeError('Function "cuMulticastBindAddr_v2" not found') + err = ( __cuMulticastBindAddr_v2)(mcHandle, 
dev, mcOffset, memptr, size, flags) + return err +{{endif}} + {{if 'cuMulticastUnbind' in found_functions}} cdef CUresult _cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -12406,13 +12456,13 @@ cdef CUresult _cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resource {{if 'cuDevSmResourceSplitByCount' in found_functions}} -cdef CUresult _cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult _cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remainder, unsigned int flags, unsigned int minCount) except ?CUDA_ERROR_NOT_FOUND nogil: global __cuDevSmResourceSplitByCount cuPythonInit() if __cuDevSmResourceSplitByCount == NULL: with gil: raise RuntimeError('Function "cuDevSmResourceSplitByCount" not found') - err = ( __cuDevSmResourceSplitByCount)(result, nbGroups, input, remaining, useFlags, minCount) + err = ( __cuDevSmResourceSplitByCount)(result, nbGroups, input, remainder, flags, minCount) return err {{endif}} @@ -14254,6 +14304,13 @@ cpdef dict _inspect_function_pointers(): data["__cuMulticastBindMem"] = 0 {{endif}} + {{if 'cuMulticastBindMem_v2' in found_functions}} + global __cuMulticastBindMem_v2 + data["__cuMulticastBindMem_v2"] = __cuMulticastBindMem_v2 + {{else}} + data["__cuMulticastBindMem_v2"] = 0 + {{endif}} + {{if 'cuMulticastBindAddr' in found_functions}} global __cuMulticastBindAddr data["__cuMulticastBindAddr"] = __cuMulticastBindAddr @@ -14261,6 +14318,13 @@ cpdef dict _inspect_function_pointers(): data["__cuMulticastBindAddr"] = 0 {{endif}} + {{if 'cuMulticastBindAddr_v2' in found_functions}} + global __cuMulticastBindAddr_v2 + data["__cuMulticastBindAddr_v2"] = __cuMulticastBindAddr_v2 + {{else}} + 
data["__cuMulticastBindAddr_v2"] = 0 + {{endif}} + {{if 'cuMulticastUnbind' in found_functions}} global __cuMulticastUnbind data["__cuMulticastUnbind"] = __cuMulticastUnbind diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in index 148530a86a..f1bbb53998 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. from cuda.bindings.cynvrtc cimport * {{if 'nvrtcGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in index 8409032852..608aebd1af 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
{{if 'Windows' == platform.system()}} import os cimport cuda.bindings._lib.windll as windll diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in index 175a931515..4878608749 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. include "../cyruntime_types.pxi" include "../_lib/cyruntime/cyruntime.pxd" @@ -1441,6 +1441,26 @@ cdef cudaError_t _cudaLibraryEnumerateKernels(cudaKernel_t* kernels, unsigned in cdef cudaError_t _cudaKernelSetAttributeForDevice(cudaKernel_t kernel, cudaFuncAttribute attr, int value, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaExecutionCtxDestroy' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxDestroy(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxSynchronize' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxSynchronize(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxRecordEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxRecordEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxWaitEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxWaitEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaGetExportTable' in found_functions}} cdef cudaError_t _cudaGetExportTable(const void** 
ppExportTable, const cudaUUID_t* pExportTableId) except ?cudaErrorCallRequiresNewerDriver nogil diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in index 2d5a2efdaa..8f309560f9 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. include "../cyruntime_functions.pxi" import os @@ -2609,6 +2609,42 @@ cdef cudaError_t _cudaKernelSetAttributeForDevice(cudaKernel_t kernel, cudaFuncA return cudaKernelSetAttributeForDevice(kernel, attr, value, device) {{endif}} +{{if 'cudaExecutionCtxDestroy' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxDestroy(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaExecutionCtxDestroy(ctx) + return cudaExecutionCtxDestroy(ctx) +{{endif}} + +{{if 'cudaExecutionCtxSynchronize' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxSynchronize(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaExecutionCtxSynchronize(ctx) + return cudaExecutionCtxSynchronize(ctx) +{{endif}} + +{{if 'cudaExecutionCtxRecordEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxRecordEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaExecutionCtxRecordEvent(ctx, event) + return cudaExecutionCtxRecordEvent(ctx, event) +{{endif}} + 
+{{if 'cudaExecutionCtxWaitEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxWaitEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaExecutionCtxWaitEvent(ctx, event) + return cudaExecutionCtxWaitEvent(ctx, event) +{{endif}} + {{if 'cudaGetExportTable' in found_functions}} cdef cudaError_t _cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pExportTableId) except ?cudaErrorCallRequiresNewerDriver nogil: diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in index 9c1769482b..663cd130cd 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
cdef extern from "": """ #define CUDA_API_PER_THREAD_DEFAULT_STREAM @@ -1444,6 +1444,26 @@ cdef cudaError_t _cudaLibraryEnumerateKernels(cudaKernel_t* kernels, unsigned in cdef cudaError_t _cudaKernelSetAttributeForDevice(cudaKernel_t kernel, cudaFuncAttribute attr, int value, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaExecutionCtxDestroy' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxDestroy(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxSynchronize' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxSynchronize(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxRecordEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxRecordEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxWaitEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxWaitEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaGetExportTable' in found_functions}} cdef cudaError_t _cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pExportTableId) except ?cudaErrorCallRequiresNewerDriver nogil diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in index 51271166c2..5101d9f603 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. 
Do not modify it directly. cdef extern from "": """ #define CUDA_API_PER_THREAD_DEFAULT_STREAM @@ -1733,6 +1733,30 @@ cdef cudaError_t _cudaKernelSetAttributeForDevice(cudaKernel_t kernel, cudaFuncA return cudaKernelSetAttributeForDevice(kernel, attr, value, device) {{endif}} +{{if 'cudaExecutionCtxDestroy' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxDestroy(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaExecutionCtxDestroy(ctx) +{{endif}} + +{{if 'cudaExecutionCtxSynchronize' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxSynchronize(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaExecutionCtxSynchronize(ctx) +{{endif}} + +{{if 'cudaExecutionCtxRecordEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxRecordEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaExecutionCtxRecordEvent(ctx, event) +{{endif}} + +{{if 'cudaExecutionCtxWaitEvent' in found_functions}} + +cdef cudaError_t _cudaExecutionCtxWaitEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaExecutionCtxWaitEvent(ctx, event) +{{endif}} + {{if 'cudaGetExportTable' in found_functions}} cdef cudaError_t _cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pExportTableId) except ?cudaErrorCallRequiresNewerDriver nogil: diff --git a/cuda_bindings/cuda/bindings/_internal/cufile.pxd b/cuda_bindings/cuda/bindings/_internal/cufile.pxd index cdbb776fd9..8352bee6f2 100644 --- a/cuda_bindings/cuda/bindings/_internal/cufile.pxd +++ b/cuda_bindings/cuda/bindings/_internal/cufile.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. 
Do not modify it directly. from ..cycufile cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx index 2f4580d791..c64eb00c5c 100644 --- a/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t import threading diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd index 6a2821253a..636e4fa963 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from ..cynvjitlink cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx index ccc412b0f8..8c96b6d640 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. 
from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx index 1b88b99892..2901536558 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd index 2cbe920218..0bfbd77744 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from ..cynvvm cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx index e1addcc9ee..408a2cb592 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. 
from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx index de3e789a41..f57e579a20 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/_version.py b/cuda_bindings/cuda/bindings/_version.py index 366d8645c0..79e2814c71 100644 --- a/cuda_bindings/cuda/bindings/_version.py +++ b/cuda_bindings/cuda/bindings/_version.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -__version__ = "13.0.3" +__version__ = "13.1.0" diff --git a/cuda_bindings/cuda/bindings/cufile.pxd b/cuda_bindings/cuda/bindings/cufile.pxd index 9fa2361ccd..abcabe9a90 100644 --- a/cuda_bindings/cuda/bindings/cufile.pxd +++ b/cuda_bindings/cuda/bindings/cufile.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. 
from libc.stdint cimport intptr_t @@ -37,6 +37,7 @@ ctypedef CUFileSizeTConfigParameter_t _SizeTConfigParameter ctypedef CUFileBoolConfigParameter_t _BoolConfigParameter ctypedef CUFileStringConfigParameter_t _StringConfigParameter ctypedef CUFileArrayConfigParameter_t _ArrayConfigParameter +ctypedef CUfileP2PFlags_t _P2PFlags ############################################################################### diff --git a/cuda_bindings/cuda/bindings/cufile.pyx b/cuda_bindings/cuda/bindings/cufile.pyx index 338f2cb8a6..24e52fed11 100644 --- a/cuda_bindings/cuda/bindings/cufile.pyx +++ b/cuda_bindings/cuda/bindings/cufile.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. cimport cython # NOQA from libc cimport errno @@ -2561,6 +2561,8 @@ class DriverStatusFlags(_IntEnum): BEEGFS_SUPPORTED = CU_FILE_BEEGFS_SUPPORTED NVME_P2P_SUPPORTED = CU_FILE_NVME_P2P_SUPPORTED SCATEFS_SUPPORTED = CU_FILE_SCATEFS_SUPPORTED + VIRTIOFS_SUPPORTED = CU_FILE_VIRTIOFS_SUPPORTED + MAX_TARGET_TYPES = CU_FILE_MAX_TARGET_TYPES class DriverControlFlags(_IntEnum): """See `CUfileDriverControlFlags_t`.""" @@ -2573,6 +2575,7 @@ class FeatureFlags(_IntEnum): BATCH_IO_SUPPORTED = CU_FILE_BATCH_IO_SUPPORTED STREAMS_SUPPORTED = CU_FILE_STREAMS_SUPPORTED PARALLEL_IO_SUPPORTED = CU_FILE_PARALLEL_IO_SUPPORTED + P2P_SUPPORTED = CU_FILE_P2P_SUPPORTED class FileHandleType(_IntEnum): """See `CUfileFileHandleType`.""" @@ -2640,6 +2643,14 @@ class ArrayConfigParameter(_IntEnum): POSIX_POOL_SLAB_SIZE_KB = CUFILE_PARAM_POSIX_POOL_SLAB_SIZE_KB POSIX_POOL_SLAB_COUNT = CUFILE_PARAM_POSIX_POOL_SLAB_COUNT +class P2PFlags(_IntEnum): + """See `CUfileP2PFlags_t`.""" + P2PDMA = CUFILE_P2PDMA + NVFS = CUFILE_NVFS + DMABUF = CUFILE_DMABUF + C2C = CUFILE_C2C + NVIDIA_PEERMEM = 
CUFILE_NVIDIA_PEERMEM + ############################################################################### # Error handling diff --git a/cuda_bindings/cuda/bindings/cycufile.pxd b/cuda_bindings/cuda/bindings/cycufile.pxd index aa8ea93d48..f5dcb2cedf 100644 --- a/cuda_bindings/cuda/bindings/cycufile.pxd +++ b/cuda_bindings/cuda/bindings/cycufile.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t from libc.time cimport time_t @@ -99,6 +99,8 @@ cdef extern from '': CU_FILE_BEEGFS_SUPPORTED CU_FILE_NVME_P2P_SUPPORTED CU_FILE_SCATEFS_SUPPORTED + CU_FILE_VIRTIOFS_SUPPORTED + CU_FILE_MAX_TARGET_TYPES cdef extern from '': ctypedef enum CUfileDriverControlFlags_t: @@ -111,6 +113,7 @@ cdef extern from '': CU_FILE_BATCH_IO_SUPPORTED CU_FILE_STREAMS_SUPPORTED CU_FILE_PARALLEL_IO_SUPPORTED + CU_FILE_P2P_SUPPORTED cdef extern from '': ctypedef enum CUfileFileHandleType: @@ -178,6 +181,14 @@ cdef extern from '': CUFILE_PARAM_POSIX_POOL_SLAB_SIZE_KB CUFILE_PARAM_POSIX_POOL_SLAB_COUNT +cdef extern from '': + ctypedef enum CUfileP2PFlags_t: + CUFILE_P2PDMA + CUFILE_NVFS + CUFILE_DMABUF + CUFILE_C2C + CUFILE_NVIDIA_PEERMEM + # types cdef extern from '': ctypedef void* CUfileHandle_t 'CUfileHandle_t' @@ -206,11 +217,11 @@ cdef extern from '': cdef extern from '': ctypedef struct CUfileFSOps_t 'CUfileFSOps_t': - char* (*fs_type)(void*) - int (*getRDMADeviceList)(void*, sockaddr_t**) - int (*getRDMADevicePriority)(void*, char*, size_t, loff_t, sockaddr_t*) - ssize_t (*read)(void*, char*, size_t, loff_t, cufileRDMAInfo_t*) - ssize_t (*write)(void*, const char*, size_t, loff_t, cufileRDMAInfo_t*) + char* (*fs_type)(const void*) + int (*getRDMADeviceList)(const void*, sockaddr_t**) + int 
(*getRDMADevicePriority)(const void*, char*, size_t, loff_t, const sockaddr_t*) + ssize_t (*read)(const void*, char*, size_t, loff_t, const cufileRDMAInfo_t*) + ssize_t (*write)(const void*, const char*, size_t, loff_t, const cufileRDMAInfo_t*) cdef union _anon_pod1 '_anon_pod1': int fd diff --git a/cuda_bindings/cuda/bindings/cycufile.pyx b/cuda_bindings/cuda/bindings/cycufile.pyx index f1589fcd17..5353c12382 100644 --- a/cuda_bindings/cuda/bindings/cycufile.pyx +++ b/cuda_bindings/cuda/bindings/cycufile.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. from ._internal cimport cufile as _cufile diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in index e3c22aba68..6107e3b4d7 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
from libc.stdint cimport uint32_t, uint64_t @@ -221,6 +221,35 @@ cdef extern from "cuda.h": ctypedef CUevent_wait_flags_enum CUevent_wait_flags + cdef enum CUatomicOperation_enum: + CU_ATOMIC_OPERATION_INTEGER_ADD = 0 + CU_ATOMIC_OPERATION_INTEGER_MIN = 1 + CU_ATOMIC_OPERATION_INTEGER_MAX = 2 + CU_ATOMIC_OPERATION_INTEGER_INCREMENT = 3 + CU_ATOMIC_OPERATION_INTEGER_DECREMENT = 4 + CU_ATOMIC_OPERATION_AND = 5 + CU_ATOMIC_OPERATION_OR = 6 + CU_ATOMIC_OPERATION_XOR = 7 + CU_ATOMIC_OPERATION_EXCHANGE = 8 + CU_ATOMIC_OPERATION_CAS = 9 + CU_ATOMIC_OPERATION_FLOAT_ADD = 10 + CU_ATOMIC_OPERATION_FLOAT_MIN = 11 + CU_ATOMIC_OPERATION_FLOAT_MAX = 12 + CU_ATOMIC_OPERATION_MAX = 13 + + ctypedef CUatomicOperation_enum CUatomicOperation + + cdef enum CUatomicOperationCapability_enum: + CU_ATOMIC_CAPABILITY_SIGNED = 1 + CU_ATOMIC_CAPABILITY_UNSIGNED = 2 + CU_ATOMIC_CAPABILITY_REDUCTION = 4 + CU_ATOMIC_CAPABILITY_SCALAR_32 = 8 + CU_ATOMIC_CAPABILITY_SCALAR_64 = 16 + CU_ATOMIC_CAPABILITY_SCALAR_128 = 32 + CU_ATOMIC_CAPABILITY_VECTOR_32x4 = 64 + + ctypedef CUatomicOperationCapability_enum CUatomicOperationCapability + cdef enum CUstreamWaitValue_flags_enum: CU_STREAM_WAIT_VALUE_GEQ = 0 CU_STREAM_WAIT_VALUE_EQ = 1 @@ -243,6 +272,7 @@ cdef extern from "cuda.h": CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4 CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5 CU_STREAM_MEM_OP_BARRIER = 6 + CU_STREAM_MEM_OP_ATOMIC_REDUCTION = 8 ctypedef CUstreamBatchMemOpType_enum CUstreamBatchMemOpType @@ -252,6 +282,19 @@ cdef extern from "cuda.h": ctypedef CUstreamMemoryBarrier_flags_enum CUstreamMemoryBarrier_flags + cdef enum CUstreamAtomicReductionOpType_enum: + CU_STREAM_ATOMIC_REDUCTION_OP_ADD = 0 + CU_STREAM_ATOMIC_REDUCTION_OP_AND = 5 + CU_STREAM_ATOMIC_REDUCTION_OP_OR = 6 + + ctypedef CUstreamAtomicReductionOpType_enum CUstreamAtomicReductionOpType + + cdef enum CUstreamAtomicReductionDataType_enum: + CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32 = 14 + CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_64 = 22 + + ctypedef 
CUstreamAtomicReductionDataType_enum CUstreamAtomicReductionDataType + cdef struct CUstreamMemOpWaitValueParams_st: CUstreamBatchMemOpType operation CUdeviceptr address @@ -276,12 +319,22 @@ cdef extern from "cuda.h": CUstreamBatchMemOpType operation unsigned int flags + cdef struct CUstreamMemOpAtomicReductionParams_st: + CUstreamBatchMemOpType operation + unsigned int flags + CUstreamAtomicReductionOpType reductionOp + CUstreamAtomicReductionDataType dataType + CUdeviceptr address + cuuint64_t value + CUdeviceptr alias + cdef union CUstreamBatchMemOpParams_union: CUstreamBatchMemOpType operation CUstreamMemOpWaitValueParams_st waitValue CUstreamMemOpWriteValueParams_st writeValue CUstreamMemOpFlushRemoteWritesParams_st flushRemoteWrites CUstreamMemOpMemoryBarrierParams_st memoryBarrier + CUstreamMemOpAtomicReductionParams_st atomicReduction cuuint64_t pad[6] ctypedef CUstreamBatchMemOpParams_union CUstreamBatchMemOpParams_v1 @@ -347,6 +400,20 @@ cdef extern from "cuda.h": CU_AD_FORMAT_HALF = 16 CU_AD_FORMAT_FLOAT = 32 CU_AD_FORMAT_UNORM_INT_101010_2 = 80 + CU_AD_FORMAT_UINT8_PACKED_422 = 81 + CU_AD_FORMAT_UINT8_PACKED_444 = 82 + CU_AD_FORMAT_UINT8_SEMIPLANAR_420 = 83 + CU_AD_FORMAT_UINT16_SEMIPLANAR_420 = 84 + CU_AD_FORMAT_UINT8_SEMIPLANAR_422 = 85 + CU_AD_FORMAT_UINT16_SEMIPLANAR_422 = 86 + CU_AD_FORMAT_UINT8_SEMIPLANAR_444 = 87 + CU_AD_FORMAT_UINT16_SEMIPLANAR_444 = 88 + CU_AD_FORMAT_UINT8_PLANAR_420 = 89 + CU_AD_FORMAT_UINT16_PLANAR_420 = 90 + CU_AD_FORMAT_UINT8_PLANAR_422 = 91 + CU_AD_FORMAT_UINT16_PLANAR_422 = 92 + CU_AD_FORMAT_UINT8_PLANAR_444 = 93 + CU_AD_FORMAT_UINT16_PLANAR_444 = 94 CU_AD_FORMAT_BC1_UNORM = 145 CU_AD_FORMAT_BC1_UNORM_SRGB = 146 CU_AD_FORMAT_BC2_UNORM = 147 @@ -561,7 +628,8 @@ cdef extern from "cuda.h": CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 145 CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED = 146 CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED = 147 - CU_DEVICE_ATTRIBUTE_MAX = 148 + 
CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED = 148 + CU_DEVICE_ATTRIBUTE_MAX = 149 ctypedef CUdevice_attribute_enum CUdevice_attribute @@ -722,7 +790,8 @@ cdef extern from "cuda.h": CU_JIT_MAX_THREADS_PER_BLOCK = 32 CU_JIT_OVERRIDE_DIRECTIVE_VALUES = 33 CU_JIT_SPLIT_COMPILE = 34 - CU_JIT_NUM_OPTIONS = 35 + CU_JIT_BINARY_LOADER_THREAD_COUNT = 35 + CU_JIT_NUM_OPTIONS = 36 ctypedef CUjit_option_enum CUjit_option @@ -1319,6 +1388,7 @@ cdef extern from "cuda.h": CUDA_ERROR_INVALID_RESOURCE_TYPE = 914 CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915 CUDA_ERROR_KEY_ROTATION = 916 + CUDA_ERROR_STREAM_DETACHED = 917 CUDA_ERROR_UNKNOWN = 999 ctypedef cudaError_enum CUresult @@ -1333,35 +1403,6 @@ cdef extern from "cuda.h": ctypedef CUdevice_P2PAttribute_enum CUdevice_P2PAttribute - cdef enum CUatomicOperation_enum: - CU_ATOMIC_OPERATION_INTEGER_ADD = 0 - CU_ATOMIC_OPERATION_INTEGER_MIN = 1 - CU_ATOMIC_OPERATION_INTEGER_MAX = 2 - CU_ATOMIC_OPERATION_INTEGER_INCREMENT = 3 - CU_ATOMIC_OPERATION_INTEGER_DECREMENT = 4 - CU_ATOMIC_OPERATION_AND = 5 - CU_ATOMIC_OPERATION_OR = 6 - CU_ATOMIC_OPERATION_XOR = 7 - CU_ATOMIC_OPERATION_EXCHANGE = 8 - CU_ATOMIC_OPERATION_CAS = 9 - CU_ATOMIC_OPERATION_FLOAT_ADD = 10 - CU_ATOMIC_OPERATION_FLOAT_MIN = 11 - CU_ATOMIC_OPERATION_FLOAT_MAX = 12 - CU_ATOMIC_OPERATION_MAX = 13 - - ctypedef CUatomicOperation_enum CUatomicOperation - - cdef enum CUatomicOperationCapability_enum: - CU_ATOMIC_CAPABILITY_SIGNED = 1 - CU_ATOMIC_CAPABILITY_UNSIGNED = 2 - CU_ATOMIC_CAPABILITY_REDUCTION = 4 - CU_ATOMIC_CAPABILITY_SCALAR_32 = 8 - CU_ATOMIC_CAPABILITY_SCALAR_64 = 16 - CU_ATOMIC_CAPABILITY_SCALAR_128 = 32 - CU_ATOMIC_CAPABILITY_VECTOR_32x4 = 64 - - ctypedef CUatomicOperationCapability_enum CUatomicOperationCapability - ctypedef void (*CUstreamCallback)(CUstream hStream, CUresult status, void* userData) ctypedef size_t (*CUoccupancyB2DSize)(int blockSize) @@ -2423,6 +2464,7 @@ cdef extern from "cuda.h": CU_COREDUMP_SKIP_ABORT = 16 
CU_COREDUMP_SKIP_CONSTBANK_MEMORY = 32 CU_COREDUMP_LIGHTWEIGHT_FLAGS = 47 + CU_COREDUMP_GZIP_COMPRESS = 64 cdef struct CUdevResourceDesc_st: pass @@ -2431,26 +2473,61 @@ cdef extern from "cuda.h": ctypedef enum CUgreenCtxCreate_flags: CU_GREEN_CTX_DEFAULT_STREAM = 1 - ctypedef enum CUdevSmResourceSplit_flags: + ctypedef enum CUdevSmResourceGroup_flags: + CU_DEV_SM_RESOURCE_GROUP_DEFAULT = 0 + CU_DEV_SM_RESOURCE_GROUP_BACKFILL = 1 + + ctypedef enum CUdevSmResourceSplitByCount_flags: CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = 1 CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = 2 ctypedef enum CUdevResourceType: CU_DEV_RESOURCE_TYPE_INVALID = 0 CU_DEV_RESOURCE_TYPE_SM = 1 + CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG = 1000 + CU_DEV_RESOURCE_TYPE_WORKQUEUE = 10000 cdef struct CUdevSmResource_st: unsigned int smCount unsigned int minSmPartitionSize unsigned int smCoscheduledAlignment + unsigned int flags ctypedef CUdevSmResource_st CUdevSmResource + ctypedef enum CUdevWorkqueueConfigScope: + CU_WORKQUEUE_SCOPE_DEVICE_CTX = 0 + CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED = 1 + + cdef struct CUdevWorkqueueConfigResource_st: + CUdevice device + unsigned int wqConcurrencyLimit + CUdevWorkqueueConfigScope sharingScope + + ctypedef CUdevWorkqueueConfigResource_st CUdevWorkqueueConfigResource + + cdef struct CUdevWorkqueueResource_st: + unsigned char reserved[40] + + ctypedef CUdevWorkqueueResource_st CUdevWorkqueueResource + + cdef struct CU_DEV_SM_RESOURCE_GROUP_PARAMS_st: + unsigned int smCount + unsigned int coscheduledSmCount + unsigned int preferredCoscheduledSmCount + unsigned int flags + unsigned int reserved[12] + + ctypedef CU_DEV_SM_RESOURCE_GROUP_PARAMS_st CU_DEV_SM_RESOURCE_GROUP_PARAMS + cdef struct CUdevResource_st: CUdevResourceType type unsigned char _internal_padding[92] CUdevSmResource sm - unsigned char _oversize[48] + CUdevWorkqueueConfigResource wqConfig + CUdevWorkqueueResource wq + unsigned char _oversize[40] + CUdevResource_st* nextResource ctypedef 
CUdevResource_st CUdevResource_v1 @@ -3660,11 +3737,21 @@ cdef CUresult cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUdevi cdef CUresult cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuMulticastBindMem_v2' in found_functions}} + +cdef CUresult cuMulticastBindMem_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuMulticastBindAddr' in found_functions}} cdef CUresult cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuMulticastBindAddr_v2' in found_functions}} + +cdef CUresult cuMulticastBindAddr_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuMulticastUnbind' in found_functions}} cdef CUresult cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size) except ?CUDA_ERROR_NOT_FOUND nogil @@ -4837,7 +4924,7 @@ cdef CUresult cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resource, {{if 'cuDevSmResourceSplitByCount' in found_functions}} -cdef CUresult cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remainder, unsigned int flags, unsigned int minCount) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} 
{{if 'cuDevResourceGenerateDesc' in found_functions}} @@ -5030,7 +5117,7 @@ cdef CUresult cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource* pCudaResou cdef CUresult cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource* pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -cdef enum: CUDA_VERSION = 13000 +cdef enum: CUDA_VERSION = 13010 cdef enum: CU_IPC_HANDLE_SIZE = 64 @@ -5170,10 +5257,6 @@ cdef enum: CU_DEVICE_CPU = -1 cdef enum: CU_DEVICE_INVALID = -2 -cdef enum: RESOURCE_ABI_VERSION = 1 - -cdef enum: RESOURCE_ABI_EXTERNAL_BYTES = 48 - cdef enum: MAX_PLANES = 3 cdef enum: CUDA_EGL_INFINITE_TIMEOUT = 4294967295 \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in index 757e977eae..869adaa0f7 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
cimport cuda.bindings._bindings.cydriver as cydriver {{if 'cuGetErrorString' in found_functions}} @@ -1180,12 +1180,24 @@ cdef CUresult cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_t m return cydriver._cuMulticastBindMem(mcHandle, mcOffset, memHandle, memOffset, size, flags) {{endif}} +{{if 'cuMulticastBindMem_v2' in found_functions}} + +cdef CUresult cuMulticastBindMem_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMulticastBindMem_v2(mcHandle, dev, mcOffset, memHandle, memOffset, size, flags) +{{endif}} + {{if 'cuMulticastBindAddr' in found_functions}} cdef CUresult cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuMulticastBindAddr(mcHandle, mcOffset, memptr, size, flags) {{endif}} +{{if 'cuMulticastBindAddr_v2' in found_functions}} + +cdef CUresult cuMulticastBindAddr_v2(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMulticastBindAddr_v2(mcHandle, dev, mcOffset, memptr, size, flags) +{{endif}} + {{if 'cuMulticastUnbind' in found_functions}} cdef CUresult cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -2592,8 +2604,8 @@ cdef CUresult cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resource, {{if 'cuDevSmResourceSplitByCount' in found_functions}} -cdef CUresult cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount) except ?CUDA_ERROR_NOT_FOUND nogil: - return 
cydriver._cuDevSmResourceSplitByCount(result, nbGroups, input, remaining, useFlags, minCount) +cdef CUresult cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remainder, unsigned int flags, unsigned int minCount) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuDevSmResourceSplitByCount(result, nbGroups, input, remainder, flags, minCount) {{endif}} {{if 'cuDevResourceGenerateDesc' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd index 17d5e83e61..6c6d0722ba 100644 --- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t, uint32_t diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx index 516bbceead..f31d6f00ac 100644 --- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx +++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from ._internal cimport nvjitlink as _nvjitlink diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in index 7a392687dd..af5acab52d 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in index b8c19e73d0..423efcf54c 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. cimport cuda.bindings._bindings.cynvrtc as cynvrtc {{if 'nvrtcGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cynvvm.pxd b/cuda_bindings/cuda/bindings/cynvvm.pxd index 58c77f1e02..48bf32856c 100644 --- a/cuda_bindings/cuda/bindings/cynvvm.pxd +++ b/cuda_bindings/cuda/bindings/cynvvm.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. ############################################################################### diff --git a/cuda_bindings/cuda/bindings/cynvvm.pyx b/cuda_bindings/cuda/bindings/cynvvm.pyx index 608c6d0a9e..34a0089065 100644 --- a/cuda_bindings/cuda/bindings/cynvvm.pyx +++ b/cuda_bindings/cuda/bindings/cynvvm.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. 
+# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from ._internal cimport nvvm as _nvvm diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in index bd0bc3d5f3..3f66f1eaa4 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t @@ -176,12 +176,12 @@ cdef struct cudaEglPlaneDesc_st: ctypedef cudaEglPlaneDesc_st cudaEglPlaneDesc -cdef union anon_union9: +cdef union anon_union10: cudaArray_t pArray[3] cudaPitchedPtr pPitch[3] cdef struct cudaEglFrame_st: - anon_union9 frame + anon_union10 frame cudaEglPlaneDesc planeDesc[3] unsigned int planeCount cudaEglFrameType frameType @@ -1639,6 +1639,26 @@ cdef cudaError_t cudaLibraryEnumerateKernels(cudaKernel_t* kernels, unsigned int cdef cudaError_t cudaKernelSetAttributeForDevice(cudaKernel_t kernel, cudaFuncAttribute attr, int value, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaExecutionCtxDestroy' in found_functions}} + +cdef cudaError_t cudaExecutionCtxDestroy(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxSynchronize' in found_functions}} + +cdef cudaError_t cudaExecutionCtxSynchronize(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaExecutionCtxRecordEvent' in found_functions}} + +cdef cudaError_t cudaExecutionCtxRecordEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil 
+{{endif}} + +{{if 'cudaExecutionCtxWaitEvent' in found_functions}} + +cdef cudaError_t cudaExecutionCtxWaitEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaGetExportTable' in found_functions}} cdef cudaError_t cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pExportTableId) except ?cudaErrorCallRequiresNewerDriver nogil @@ -1952,8 +1972,8 @@ cdef enum: cudaTextureType2DLayered = 242 cdef enum: cudaTextureTypeCubemapLayered = 252 -cdef enum: CUDART_VERSION = 13000 +cdef enum: CUDART_VERSION = 13010 -cdef enum: __CUDART_API_VERSION = 13000 +cdef enum: __CUDART_API_VERSION = 13010 cdef enum: CUDA_EGL_MAX_PLANES = 3 \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in index 3031e43d2f..3dfe699c85 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
cimport cuda.bindings._bindings.cyruntime as cyruntime cimport cython @@ -1727,6 +1727,30 @@ cdef cudaError_t cudaKernelSetAttributeForDevice(cudaKernel_t kernel, cudaFuncAt return cyruntime._cudaKernelSetAttributeForDevice(kernel, attr, value, device) {{endif}} +{{if 'cudaExecutionCtxDestroy' in found_functions}} + +cdef cudaError_t cudaExecutionCtxDestroy(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaExecutionCtxDestroy(ctx) +{{endif}} + +{{if 'cudaExecutionCtxSynchronize' in found_functions}} + +cdef cudaError_t cudaExecutionCtxSynchronize(cudaExecutionContext_t ctx) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaExecutionCtxSynchronize(ctx) +{{endif}} + +{{if 'cudaExecutionCtxRecordEvent' in found_functions}} + +cdef cudaError_t cudaExecutionCtxRecordEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaExecutionCtxRecordEvent(ctx, event) +{{endif}} + +{{if 'cudaExecutionCtxWaitEvent' in found_functions}} + +cdef cudaError_t cudaExecutionCtxWaitEvent(cudaExecutionContext_t ctx, cudaEvent_t event) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaExecutionCtxWaitEvent(ctx, event) +{{endif}} + {{if 'cudaGetExportTable' in found_functions}} cdef cudaError_t cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pExportTableId) except ?cudaErrorCallRequiresNewerDriver nogil: diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in index 14230f1a26..118e16e2ee 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. cdef extern from "cuda_runtime_api.h": {{if 'cudaDeviceReset' in found_functions}} @@ -1438,6 +1438,26 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaKernelSetAttributeForDevice(cudaKernel_t kernel, cudaFuncAttribute attr, int value, int device) nogil + {{endif}} + {{if 'cudaExecutionCtxDestroy' in found_functions}} + + cudaError_t cudaExecutionCtxDestroy(cudaExecutionContext_t ctx) nogil + + {{endif}} + {{if 'cudaExecutionCtxSynchronize' in found_functions}} + + cudaError_t cudaExecutionCtxSynchronize(cudaExecutionContext_t ctx) nogil + + {{endif}} + {{if 'cudaExecutionCtxRecordEvent' in found_functions}} + + cudaError_t cudaExecutionCtxRecordEvent(cudaExecutionContext_t ctx, cudaEvent_t event) nogil + + {{endif}} + {{if 'cudaExecutionCtxWaitEvent' in found_functions}} + + cudaError_t cudaExecutionCtxWaitEvent(cudaExecutionContext_t ctx, cudaEvent_t event) nogil + {{endif}} {{if 'cudaGetExportTable' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in index 6aca016f7e..3af28f67e7 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
cdef extern from "vector_types.h": @@ -145,11 +145,20 @@ cdef extern from "driver_types.h": cudaErrorFunctionNotLoaded = 913 cudaErrorInvalidResourceType = 914 cudaErrorInvalidResourceConfiguration = 915 + cudaErrorStreamDetached = 917 cudaErrorUnknown = 999 cudaErrorApiFailureBase = 10000 ctypedef cudaError cudaError_t + cdef struct CUdevResourceDesc_st: + pass + ctypedef CUdevResourceDesc_st* cudaDevResourceDesc_t + + cdef struct cudaExecutionContext_st: + pass + ctypedef cudaExecutionContext_st* cudaExecutionContext_t + cdef struct cudaChannelFormatDesc: int x int y @@ -218,7 +227,8 @@ cdef extern from "driver_types.h": cdef struct cudaMemcpyNodeParams: int flags - int reserved[3] + int reserved + cudaExecutionContext_t ctx cudaMemcpy3DParms copyParams cdef struct cudaMemcpy3DPeerParms: @@ -247,6 +257,7 @@ cdef extern from "driver_types.h": unsigned int elementSize size_t width size_t height + cudaExecutionContext_t ctx cdef struct cudaAccessPolicyWindow: void* base_ptr @@ -613,6 +624,40 @@ cdef extern from "driver_types.h": unsigned int flags unsigned int reserved[16] + cdef struct cudaDevSmResource: + unsigned int smCount + unsigned int minSmPartitionSize + unsigned int smCoscheduledAlignment + unsigned int flags + + cdef struct cudaDevWorkqueueConfigResource: + int device + unsigned int wqConcurrencyLimit + cudaDevWorkqueueConfigScope sharingScope + + cdef struct cudaDevWorkqueueResource: + unsigned char reserved[40] + + cdef struct cudaDevSmResourceGroupParams_st: + unsigned int smCount + unsigned int coscheduledSmCount + unsigned int preferredCoscheduledSmCount + unsigned int flags + unsigned int reserved[12] + + ctypedef cudaDevSmResourceGroupParams_st cudaDevSmResourceGroupParams + + cdef struct cudaDevResource_st: + cudaDevResourceType type + unsigned char _internal_padding[92] + cudaDevSmResource sm + cudaDevWorkqueueConfigResource wqConfig + cudaDevWorkqueueResource wq + unsigned char _oversize[40] + cudaDevResource_st* nextResource + + ctypedef 
cudaDevResource_st cudaDevResource + cdef struct CUstream_st: pass ctypedef CUstream_st* cudaStream_t @@ -684,6 +729,7 @@ cdef extern from "driver_types.h": unsigned int sharedMemBytes void** kernelParams void** extra + cudaExecutionContext_t ctx cdef struct cudaExternalSemaphoreSignalNodeParams: cudaExternalSemaphore_t* extSemArray @@ -710,6 +756,7 @@ cdef extern from "driver_types.h": cudaGraphConditionalNodeType type unsigned int size cudaGraph_t* phGraph_out + cudaExecutionContext_t ctx cdef struct cudaChildGraphNodeParams: cudaGraph_t graph @@ -789,7 +836,7 @@ cdef extern from "driver_types.h": size_t offset size_t size - cdef union anon_union7: + cdef union anon_union8: dim3 gridDim anon_struct16 param unsigned int isEnabled @@ -797,7 +844,7 @@ cdef extern from "driver_types.h": cdef struct cudaGraphKernelNodeUpdate: cudaGraphDeviceNode_t node cudaGraphKernelNodeField field - anon_union7 updateData + anon_union8 updateData cdef enum cudaLaunchMemSyncDomain: cudaLaunchMemSyncDomainDefault = 0 @@ -886,12 +933,12 @@ cdef extern from "driver_types.h": cdef struct anon_struct22: unsigned long long bytesOverBudget - cdef union anon_union8: + cdef union anon_union9: anon_struct22 overBudget cdef struct cudaAsyncNotificationInfo: cudaAsyncNotificationType type - anon_union8 info + anon_union9 info ctypedef cudaAsyncNotificationInfo cudaAsyncNotificationInfo_t @@ -1382,6 +1429,24 @@ cdef extern from "driver_types.h": cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd = 9 cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 = 10 + cdef enum cudaDevSmResourceGroup_flags: + cudaDevSmResourceGroupDefault = 0 + cudaDevSmResourceGroupBackfill = 1 + + cdef enum cudaDevSmResourceSplitByCount_flags: + cudaDevSmResourceSplitIgnoreSmCoscheduling = 1 + cudaDevSmResourceSplitMaxPotentialClusterSize = 2 + + cdef enum cudaDevResourceType: + cudaDevResourceTypeInvalid = 0 + cudaDevResourceTypeSm = 1 + cudaDevResourceTypeWorkqueueConfig = 1000 + cudaDevResourceTypeWorkqueue = 10000 + 
+ cdef enum cudaDevWorkqueueConfigScope: + cudaDevWorkqueueConfigScopeDeviceCtx = 0 + cudaDevWorkqueueConfigScopeGreenCtxBalanced = 1 + cdef enum cudaJitOption: cudaJitMaxRegisters = 0 cudaJitThreadsPerBlock = 1 diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in index 0a0131f5bf..9035d1f9d8 100644 --- a/cuda_bindings/cuda/bindings/driver.pxd.in +++ b/cuda_bindings/cuda/bindings/driver.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. cimport cuda.bindings.cydriver as cydriver include "_lib/utils.pxd" @@ -794,6 +794,57 @@ cdef class CUstreamMemOpMemoryBarrierParams_st: """ cdef cydriver.CUstreamBatchMemOpParams_union* _pvt_ptr {{endif}} +{{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + +cdef class CUstreamMemOpAtomicReductionParams_st: + """ + Attributes + ---------- + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.operation' in found_struct}} + operation : CUstreamBatchMemOpType + + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.flags' in found_struct}} + flags : unsigned int + Must be 0 + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.reductionOp' in found_struct}} + reductionOp : CUstreamAtomicReductionOpType + See CUstreamAtomicReductionOpType + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.dataType' in found_struct}} + dataType : CUstreamAtomicReductionDataType + See CUstreamAtomicReductionDataType + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}} + address : CUdeviceptr + The address the atomic operation will be operated on + {{endif}} + {{if 
'CUstreamBatchMemOpParams_union.atomicReduction.value' in found_struct}} + value : cuuint64_t + The operand value the atomic operation will operate with + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}} + alias : CUdeviceptr + For driver internal use. Initial value is unimportant. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cydriver.CUstreamBatchMemOpParams_union* _pvt_ptr + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}} + cdef CUdeviceptr _address + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.value' in found_struct}} + cdef cuuint64_t _value + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}} + cdef CUdeviceptr _alias + {{endif}} +{{endif}} {{if 'CUstreamBatchMemOpParams_union' in found_struct}} cdef class CUstreamBatchMemOpParams_union: @@ -824,6 +875,10 @@ cdef class CUstreamBatchMemOpParams_union: {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st Params for CU_STREAM_MEM_OP_BARRIER operations. 
+ {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + atomicReduction : CUstreamMemOpAtomicReductionParams_st + {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : list[cuuint64_t] @@ -849,6 +904,9 @@ cdef class CUstreamBatchMemOpParams_union: {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} cdef CUstreamMemOpMemoryBarrierParams_st _memoryBarrier {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + cdef CUstreamMemOpAtomicReductionParams_st _atomicReduction + {{endif}} {{endif}} {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st' in found_struct}} @@ -5184,20 +5242,23 @@ cdef class CUdevSmResource_st: {{if 'CUdevSmResource_st.smCount' in found_struct}} smCount : unsigned int The amount of streaming multiprocessors available in this resource. - This is an output parameter only, do not write to this field. {{endif}} {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} minSmPartitionSize : unsigned int The minimum number of streaming multiprocessors required to - partition this resource. This is an output parameter only, do not - write to this field. + partition this resource. {{endif}} {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} smCoscheduledAlignment : unsigned int The number of streaming multiprocessors in this resource that are guaranteed to be co-scheduled on the same GPU processing cluster. - smCount is a multiple of this value. This is an output parameter - only, do not write to this field. + smCount will be a multiple of this value, unless the backfill flag + is set. + {{endif}} + {{if 'CUdevSmResource_st.flags' in found_struct}} + flags : unsigned int + The flags set on this SM resource. For possible values see + ::CUdevSmResourceGroup_flags. 
{{endif}} Methods @@ -5208,6 +5269,93 @@ cdef class CUdevSmResource_st: cdef cydriver.CUdevSmResource_st _pvt_val cdef cydriver.CUdevSmResource_st* _pvt_ptr {{endif}} +{{if 'CUdevWorkqueueConfigResource_st' in found_struct}} + +cdef class CUdevWorkqueueConfigResource_st: + """ + Attributes + ---------- + {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}} + device : CUdevice + The device on which the workqueue resources are available + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}} + wqConcurrencyLimit : unsigned int + The expected maximum number of concurrent stream-ordered workloads + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}} + sharingScope : CUdevWorkqueueConfigScope + The sharing scope for the workqueue resources + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cydriver.CUdevWorkqueueConfigResource_st _pvt_val + cdef cydriver.CUdevWorkqueueConfigResource_st* _pvt_ptr + {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}} + cdef CUdevice _device + {{endif}} +{{endif}} +{{if 'CUdevWorkqueueResource_st' in found_struct}} + +cdef class CUdevWorkqueueResource_st: + """ + Attributes + ---------- + {{if 'CUdevWorkqueueResource_st.reserved' in found_struct}} + reserved : bytes + Reserved for future use + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cydriver.CUdevWorkqueueResource_st _pvt_val + cdef cydriver.CUdevWorkqueueResource_st* _pvt_ptr +{{endif}} +{{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st' in found_struct}} + +cdef class CU_DEV_SM_RESOURCE_GROUP_PARAMS_st: + """ + Attributes + ---------- + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.smCount' in found_struct}} + smCount : unsigned int + The amount of SMs available in this resource. 
+ {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.coscheduledSmCount' in found_struct}} + coscheduledSmCount : unsigned int + The amount of co-scheduled SMs grouped together for locality + purposes. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.preferredCoscheduledSmCount' in found_struct}} + preferredCoscheduledSmCount : unsigned int + When possible, combine co-scheduled groups together into larger + groups of this size. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}} + flags : unsigned int + Combination of `CUdevSmResourceGroup_flags` values to indicate this + this group is created. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}} + reserved : list[unsigned int] + Reserved for future use - ensure this is is zero initialized. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS_st _pvt_val + cdef cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS_st* _pvt_ptr +{{endif}} {{if 'CUdevResource_st' in found_struct}} cdef class CUdevResource_st: @@ -5226,9 +5374,23 @@ cdef class CUdevResource_st: sm : CUdevSmResource Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`. {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + wqConfig : CUdevWorkqueueConfigResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG + `typename`. + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + wq : CUdevWorkqueueResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE + `typename`. 
+ {{endif}} {{if 'CUdevResource_st._oversize' in found_struct}} _oversize : bytes + {{endif}} + {{if 'CUdevResource_st.nextResource' in found_struct}} + nextResource : CUdevResource_st + {{endif}} Methods @@ -5241,6 +5403,16 @@ cdef class CUdevResource_st: {{if 'CUdevResource_st.sm' in found_struct}} cdef CUdevSmResource _sm {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + cdef CUdevWorkqueueConfigResource _wqConfig + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + cdef CUdevWorkqueueResource _wq + {{endif}} + {{if 'CUdevResource_st.nextResource' in found_struct}} + cdef size_t _nextResource_length + cdef cydriver.CUdevResource_st* _nextResource + {{endif}} {{endif}} {{if True}} @@ -5578,6 +5750,10 @@ cdef class CUstreamBatchMemOpParams_v1(CUstreamBatchMemOpParams_union): {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st Params for CU_STREAM_MEM_OP_BARRIER operations. + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + atomicReduction : CUstreamMemOpAtomicReductionParams_st + {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : list[cuuint64_t] @@ -5621,6 +5797,10 @@ cdef class CUstreamBatchMemOpParams(CUstreamBatchMemOpParams_v1): {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st Params for CU_STREAM_MEM_OP_BARRIER operations. + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + atomicReduction : CUstreamMemOpAtomicReductionParams_st + {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : list[cuuint64_t] @@ -10720,20 +10900,104 @@ cdef class CUdevSmResource(CUdevSmResource_st): {{if 'CUdevSmResource_st.smCount' in found_struct}} smCount : unsigned int The amount of streaming multiprocessors available in this resource. - This is an output parameter only, do not write to this field. 
{{endif}} {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} minSmPartitionSize : unsigned int The minimum number of streaming multiprocessors required to - partition this resource. This is an output parameter only, do not - write to this field. + partition this resource. {{endif}} {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} smCoscheduledAlignment : unsigned int The number of streaming multiprocessors in this resource that are guaranteed to be co-scheduled on the same GPU processing cluster. - smCount is a multiple of this value. This is an output parameter - only, do not write to this field. + smCount will be a multiple of this value, unless the backfill flag + is set. + {{endif}} + {{if 'CUdevSmResource_st.flags' in found_struct}} + flags : unsigned int + The flags set on this SM resource. For possible values see + ::CUdevSmResourceGroup_flags. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + pass +{{endif}} +{{if 'CUdevWorkqueueConfigResource' in found_types}} + +cdef class CUdevWorkqueueConfigResource(CUdevWorkqueueConfigResource_st): + """ + Attributes + ---------- + {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}} + device : CUdevice + The device on which the workqueue resources are available + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}} + wqConcurrencyLimit : unsigned int + The expected maximum number of concurrent stream-ordered workloads + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}} + sharingScope : CUdevWorkqueueConfigScope + The sharing scope for the workqueue resources + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + pass +{{endif}} +{{if 'CUdevWorkqueueResource' in found_types}} + +cdef class CUdevWorkqueueResource(CUdevWorkqueueResource_st): + """ + Attributes + ---------- + {{if 'CUdevWorkqueueResource_st.reserved' in found_struct}} + 
reserved : bytes + Reserved for future use + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + pass +{{endif}} +{{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS' in found_types}} + +cdef class CU_DEV_SM_RESOURCE_GROUP_PARAMS(CU_DEV_SM_RESOURCE_GROUP_PARAMS_st): + """ + Attributes + ---------- + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.smCount' in found_struct}} + smCount : unsigned int + The amount of SMs available in this resource. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.coscheduledSmCount' in found_struct}} + coscheduledSmCount : unsigned int + The amount of co-scheduled SMs grouped together for locality + purposes. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.preferredCoscheduledSmCount' in found_struct}} + preferredCoscheduledSmCount : unsigned int + When possible, combine co-scheduled groups together into larger + groups of this size. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}} + flags : unsigned int + Combination of `CUdevSmResourceGroup_flags` values to indicate this + this group is created. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}} + reserved : list[unsigned int] + Reserved for future use - ensure this is is zero initialized. {{endif}} Methods @@ -10761,9 +11025,23 @@ cdef class CUdevResource_v1(CUdevResource_st): sm : CUdevSmResource Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`. {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + wqConfig : CUdevWorkqueueConfigResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG + `typename`. + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + wq : CUdevWorkqueueResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE + `typename`. 
+ {{endif}} {{if 'CUdevResource_st._oversize' in found_struct}} _oversize : bytes + {{endif}} + {{if 'CUdevResource_st.nextResource' in found_struct}} + nextResource : CUdevResource_st + {{endif}} Methods @@ -10791,9 +11069,23 @@ cdef class CUdevResource(CUdevResource_v1): sm : CUdevSmResource Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`. {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + wqConfig : CUdevWorkqueueConfigResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG + `typename`. + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + wq : CUdevWorkqueueResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE + `typename`. + {{endif}} {{if 'CUdevResource_st._oversize' in found_struct}} _oversize : bytes + {{endif}} + {{if 'CUdevResource_st.nextResource' in found_struct}} + nextResource : CUdevResource_st + {{endif}} Methods diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 33c1c770fa..b6ca9b7adc 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
from typing import Any, Optional from enum import IntEnum import cython @@ -340,10 +340,6 @@ CU_DEVICE_CPU = cydriver.CU_DEVICE_CPU #: Device that represents an invalid device CU_DEVICE_INVALID = cydriver.CU_DEVICE_INVALID -RESOURCE_ABI_VERSION = cydriver.RESOURCE_ABI_VERSION - -RESOURCE_ABI_EXTERNAL_BYTES = cydriver.RESOURCE_ABI_EXTERNAL_BYTES - #: Maximum number of planes per frame MAX_PLANES = cydriver.MAX_PLANES @@ -595,6 +591,66 @@ class CUevent_wait_flags(IntEnum): _dict_CUevent_wait_flags = dict(((int(v), v) for k, v in CUevent_wait_flags.__members__.items())) {{endif}} +{{if 'CUatomicOperation_enum' in found_types}} + +class CUatomicOperation(IntEnum): + """ + CUDA-valid Atomic Operations + """ + {{if 'CU_ATOMIC_OPERATION_INTEGER_ADD' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_ADD = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_ADD{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_MIN' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_MIN = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_MIN{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_MAX' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_MAX{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_INCREMENT' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_INCREMENT = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_INCREMENT{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_DECREMENT' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_DECREMENT = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_DECREMENT{{endif}} + {{if 'CU_ATOMIC_OPERATION_AND' in found_values}} + CU_ATOMIC_OPERATION_AND = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_AND{{endif}} + {{if 'CU_ATOMIC_OPERATION_OR' in found_values}} + CU_ATOMIC_OPERATION_OR = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_OR{{endif}} + {{if 'CU_ATOMIC_OPERATION_XOR' in found_values}} + CU_ATOMIC_OPERATION_XOR = 
cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_XOR{{endif}} + {{if 'CU_ATOMIC_OPERATION_EXCHANGE' in found_values}} + CU_ATOMIC_OPERATION_EXCHANGE = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_EXCHANGE{{endif}} + {{if 'CU_ATOMIC_OPERATION_CAS' in found_values}} + CU_ATOMIC_OPERATION_CAS = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_CAS{{endif}} + {{if 'CU_ATOMIC_OPERATION_FLOAT_ADD' in found_values}} + CU_ATOMIC_OPERATION_FLOAT_ADD = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_ADD{{endif}} + {{if 'CU_ATOMIC_OPERATION_FLOAT_MIN' in found_values}} + CU_ATOMIC_OPERATION_FLOAT_MIN = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_MIN{{endif}} + {{if 'CU_ATOMIC_OPERATION_FLOAT_MAX' in found_values}} + CU_ATOMIC_OPERATION_FLOAT_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_MAX{{endif}} + {{if 'CU_ATOMIC_OPERATION_MAX' in found_values}} + CU_ATOMIC_OPERATION_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_MAX{{endif}} + +_dict_CUatomicOperation = dict(((int(v), v) for k, v in CUatomicOperation.__members__.items())) +{{endif}} +{{if 'CUatomicOperationCapability_enum' in found_types}} + +class CUatomicOperationCapability(IntEnum): + """ + CUDA-valid Atomic Operation capabilities + """ + {{if 'CU_ATOMIC_CAPABILITY_SIGNED' in found_values}} + CU_ATOMIC_CAPABILITY_SIGNED = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SIGNED{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_UNSIGNED' in found_values}} + CU_ATOMIC_CAPABILITY_UNSIGNED = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_UNSIGNED{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_REDUCTION' in found_values}} + CU_ATOMIC_CAPABILITY_REDUCTION = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_REDUCTION{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_SCALAR_32' in found_values}} + CU_ATOMIC_CAPABILITY_SCALAR_32 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_32{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_SCALAR_64' in 
found_values}} + CU_ATOMIC_CAPABILITY_SCALAR_64 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_64{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_SCALAR_128' in found_values}} + CU_ATOMIC_CAPABILITY_SCALAR_128 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_128{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_VECTOR_32x4' in found_values}} + CU_ATOMIC_CAPABILITY_VECTOR_32x4 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_VECTOR_32x4{{endif}} + +_dict_CUatomicOperationCapability = dict(((int(v), v) for k, v in CUatomicOperationCapability.__members__.items())) +{{endif}} {{if 'CUstreamWaitValue_flags_enum' in found_types}} class CUstreamWaitValue_flags(IntEnum): @@ -691,6 +747,11 @@ class CUstreamBatchMemOpType(IntEnum): #: Insert a memory barrier of the specified type CU_STREAM_MEM_OP_BARRIER = cydriver.CUstreamBatchMemOpType_enum.CU_STREAM_MEM_OP_BARRIER{{endif}} + {{if 'CU_STREAM_MEM_OP_ATOMIC_REDUCTION' in found_values}} + + #: Perform a atomic reduction. 
See + #: :py:obj:`~.CUstreamBatchMemOpParams`::atomicReduction + CU_STREAM_MEM_OP_ATOMIC_REDUCTION = cydriver.CUstreamBatchMemOpType_enum.CU_STREAM_MEM_OP_ATOMIC_REDUCTION{{endif}} _dict_CUstreamBatchMemOpType = dict(((int(v), v) for k, v in CUstreamBatchMemOpType.__members__.items())) {{endif}} @@ -711,6 +772,42 @@ class CUstreamMemoryBarrier_flags(IntEnum): _dict_CUstreamMemoryBarrier_flags = dict(((int(v), v) for k, v in CUstreamMemoryBarrier_flags.__members__.items())) {{endif}} +{{if 'CUstreamAtomicReductionOpType_enum' in found_types}} + +class CUstreamAtomicReductionOpType(IntEnum): + """ + Atomic reduction operation types for + :py:obj:`~.CUstreamBatchMemOpParams`::atomicReduction::reductionOp + """ + {{if 'CU_STREAM_ATOMIC_REDUCTION_OP_ADD' in found_values}} + + #: Performs an atomic ADD: *(address) = *(address) + value + CU_STREAM_ATOMIC_REDUCTION_OP_ADD = cydriver.CUstreamAtomicReductionOpType_enum.CU_STREAM_ATOMIC_REDUCTION_OP_ADD{{endif}} + {{if 'CU_STREAM_ATOMIC_REDUCTION_OP_AND' in found_values}} + + #: Performs an atomic AND: *(address) = *(address) & value + CU_STREAM_ATOMIC_REDUCTION_OP_AND = cydriver.CUstreamAtomicReductionOpType_enum.CU_STREAM_ATOMIC_REDUCTION_OP_AND{{endif}} + {{if 'CU_STREAM_ATOMIC_REDUCTION_OP_OR' in found_values}} + + #: Performs an atomic OR: *(address) = *(address) | value + CU_STREAM_ATOMIC_REDUCTION_OP_OR = cydriver.CUstreamAtomicReductionOpType_enum.CU_STREAM_ATOMIC_REDUCTION_OP_OR{{endif}} + +_dict_CUstreamAtomicReductionOpType = dict(((int(v), v) for k, v in CUstreamAtomicReductionOpType.__members__.items())) +{{endif}} +{{if 'CUstreamAtomicReductionDataType_enum' in found_types}} + +class CUstreamAtomicReductionDataType(IntEnum): + """ + Atomic reduction data types for + :py:obj:`~.CUstreamBatchMemOpParams`::atomicReduction::dataType + """ + {{if 'CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32' in found_values}} + CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32 = 
cydriver.CUstreamAtomicReductionDataType_enum.CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32{{endif}} + {{if 'CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_64' in found_values}} + CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_64 = cydriver.CUstreamAtomicReductionDataType_enum.CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_64{{endif}} + +_dict_CUstreamAtomicReductionDataType = dict(((int(v), v) for k, v in CUstreamAtomicReductionDataType.__members__.items())) +{{endif}} {{if 'CUoccupancy_flags_enum' in found_types}} class CUoccupancy_flags(IntEnum): @@ -801,6 +898,65 @@ class CUarray_format(IntEnum): #: 4 channel unorm R10G10B10A2 RGB format CU_AD_FORMAT_UNORM_INT_101010_2 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UNORM_INT_101010_2{{endif}} + {{if 'CU_AD_FORMAT_UINT8_PACKED_422' in found_values}} + + #: 4 channel unsigned 8-bit YUV packed format, with 4:2:2 sampling + CU_AD_FORMAT_UINT8_PACKED_422 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_PACKED_422{{endif}} + {{if 'CU_AD_FORMAT_UINT8_PACKED_444' in found_values}} + + #: 4 channel unsigned 8-bit YUV packed format, with 4:4:4 sampling + CU_AD_FORMAT_UINT8_PACKED_444 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_PACKED_444{{endif}} + {{if 'CU_AD_FORMAT_UINT8_SEMIPLANAR_420' in found_values}} + + #: 3 channel unsigned 8-bit YUV semi-planar format, with 4:2:0 sampling + CU_AD_FORMAT_UINT8_SEMIPLANAR_420 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_SEMIPLANAR_420{{endif}} + {{if 'CU_AD_FORMAT_UINT16_SEMIPLANAR_420' in found_values}} + + #: 3 channel unsigned 16-bit YUV semi-planar format, with 4:2:0 + #: sampling + CU_AD_FORMAT_UINT16_SEMIPLANAR_420 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT16_SEMIPLANAR_420{{endif}} + {{if 'CU_AD_FORMAT_UINT8_SEMIPLANAR_422' in found_values}} + + #: 3 channel unsigned 8-bit YUV semi-planar format, with 4:2:2 sampling + CU_AD_FORMAT_UINT8_SEMIPLANAR_422 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_SEMIPLANAR_422{{endif}} + {{if 'CU_AD_FORMAT_UINT16_SEMIPLANAR_422' in found_values}} + + #: 3 
channel unsigned 16-bit YUV semi-planar format, with 4:2:2 + #: sampling + CU_AD_FORMAT_UINT16_SEMIPLANAR_422 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT16_SEMIPLANAR_422{{endif}} + {{if 'CU_AD_FORMAT_UINT8_SEMIPLANAR_444' in found_values}} + + #: 3 channel unsigned 8-bit YUV semi-planar format, with 4:4:4 sampling + CU_AD_FORMAT_UINT8_SEMIPLANAR_444 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_SEMIPLANAR_444{{endif}} + {{if 'CU_AD_FORMAT_UINT16_SEMIPLANAR_444' in found_values}} + + #: 3 channel unsigned 16-bit YUV semi-planar format, with 4:4:4 + #: sampling + CU_AD_FORMAT_UINT16_SEMIPLANAR_444 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT16_SEMIPLANAR_444{{endif}} + {{if 'CU_AD_FORMAT_UINT8_PLANAR_420' in found_values}} + + #: 3 channel unsigned 8-bit YUV planar format, with 4:2:0 sampling + CU_AD_FORMAT_UINT8_PLANAR_420 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_PLANAR_420{{endif}} + {{if 'CU_AD_FORMAT_UINT16_PLANAR_420' in found_values}} + + #: 3 channel unsigned 16-bit YUV planar format, with 4:2:0 sampling + CU_AD_FORMAT_UINT16_PLANAR_420 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT16_PLANAR_420{{endif}} + {{if 'CU_AD_FORMAT_UINT8_PLANAR_422' in found_values}} + + #: 3 channel unsigned 8-bit YUV planar format, with 4:2:2 sampling + CU_AD_FORMAT_UINT8_PLANAR_422 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_PLANAR_422{{endif}} + {{if 'CU_AD_FORMAT_UINT16_PLANAR_422' in found_values}} + + #: 3 channel unsigned 16-bit YUV planar format, with 4:2:2 sampling + CU_AD_FORMAT_UINT16_PLANAR_422 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT16_PLANAR_422{{endif}} + {{if 'CU_AD_FORMAT_UINT8_PLANAR_444' in found_values}} + + #: 3 channel unsigned 8-bit YUV planar format, with 4:4:4 sampling + CU_AD_FORMAT_UINT8_PLANAR_444 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT8_PLANAR_444{{endif}} + {{if 'CU_AD_FORMAT_UINT16_PLANAR_444' in found_values}} + + #: 3 channel unsigned 16-bit YUV planar format, with 4:4:4 sampling + 
CU_AD_FORMAT_UINT16_PLANAR_444 = cydriver.CUarray_format_enum.CU_AD_FORMAT_UINT16_PLANAR_444{{endif}} {{if 'CU_AD_FORMAT_BC1_UNORM' in found_values}} #: 4 channel unsigned normalized block-compressed (BC1 compression) @@ -1704,6 +1860,11 @@ class CUdevice_attribute(IntEnum): #: Link between the device and the host supports only some native #: atomic operations CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED{{endif}} + {{if 'CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED' in found_values}} + + #: Device supports atomic reduction operations in stream batch memory + #: operations + CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED{{endif}} {{if 'CU_DEVICE_ATTRIBUTE_MAX' in found_values}} CU_DEVICE_ATTRIBUTE_MAX = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAX{{endif}} @@ -2433,6 +2594,17 @@ class CUjit_option(IntEnum): #: used. Option type: unsigned int #: Applies to: compiler only CU_JIT_SPLIT_COMPILE = cydriver.CUjit_option_enum.CU_JIT_SPLIT_COMPILE{{endif}} + {{if 'CU_JIT_BINARY_LOADER_THREAD_COUNT' in found_values}} + + #: This option specifies the maximum number of concurrent threads to + #: use when compiling device code. If the specified value is 1, the + #: option will be ignored. If the specified value is 0, the number of + #: threads will match the number of CPUs on the underlying machine. + #: Otherwise, if the option is N, then up to N threads will be used. + #: This option is ignored if the env var + #: CUDA_BINARY_LOADER_THREAD_COUNT is set. 
Option type: unsigned int + #: Applies to: compiler and linker + CU_JIT_BINARY_LOADER_THREAD_COUNT = cydriver.CUjit_option_enum.CU_JIT_BINARY_LOADER_THREAD_COUNT{{endif}} {{if 'CU_JIT_NUM_OPTIONS' in found_values}} CU_JIT_NUM_OPTIONS = cydriver.CUjit_option_enum.CU_JIT_NUM_OPTIONS{{endif}} @@ -4029,6 +4201,13 @@ class CUresult(IntEnum): #: This error indicates that an error happened during the key rotation #: sequence. CUDA_ERROR_KEY_ROTATION = cydriver.cudaError_enum.CUDA_ERROR_KEY_ROTATION{{endif}} + {{if 'CUDA_ERROR_STREAM_DETACHED' in found_values}} + + #: This error indicates that the requested operation is not permitted + #: because the stream is in a detached state. This can occur if the + #: green context associated with the stream has been destroyed, + #: limiting the stream's operational capabilities. + CUDA_ERROR_STREAM_DETACHED = cydriver.cudaError_enum.CUDA_ERROR_STREAM_DETACHED{{endif}} {{if 'CUDA_ERROR_UNKNOWN' in found_values}} #: This indicates that an unknown internal error has occurred. 
@@ -4070,66 +4249,6 @@ class CUdevice_P2PAttribute(IntEnum): _dict_CUdevice_P2PAttribute = dict(((int(v), v) for k, v in CUdevice_P2PAttribute.__members__.items())) {{endif}} -{{if 'CUatomicOperation_enum' in found_types}} - -class CUatomicOperation(IntEnum): - """ - CUDA-valid Atomic Operations - """ - {{if 'CU_ATOMIC_OPERATION_INTEGER_ADD' in found_values}} - CU_ATOMIC_OPERATION_INTEGER_ADD = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_ADD{{endif}} - {{if 'CU_ATOMIC_OPERATION_INTEGER_MIN' in found_values}} - CU_ATOMIC_OPERATION_INTEGER_MIN = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_MIN{{endif}} - {{if 'CU_ATOMIC_OPERATION_INTEGER_MAX' in found_values}} - CU_ATOMIC_OPERATION_INTEGER_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_MAX{{endif}} - {{if 'CU_ATOMIC_OPERATION_INTEGER_INCREMENT' in found_values}} - CU_ATOMIC_OPERATION_INTEGER_INCREMENT = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_INCREMENT{{endif}} - {{if 'CU_ATOMIC_OPERATION_INTEGER_DECREMENT' in found_values}} - CU_ATOMIC_OPERATION_INTEGER_DECREMENT = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_DECREMENT{{endif}} - {{if 'CU_ATOMIC_OPERATION_AND' in found_values}} - CU_ATOMIC_OPERATION_AND = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_AND{{endif}} - {{if 'CU_ATOMIC_OPERATION_OR' in found_values}} - CU_ATOMIC_OPERATION_OR = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_OR{{endif}} - {{if 'CU_ATOMIC_OPERATION_XOR' in found_values}} - CU_ATOMIC_OPERATION_XOR = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_XOR{{endif}} - {{if 'CU_ATOMIC_OPERATION_EXCHANGE' in found_values}} - CU_ATOMIC_OPERATION_EXCHANGE = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_EXCHANGE{{endif}} - {{if 'CU_ATOMIC_OPERATION_CAS' in found_values}} - CU_ATOMIC_OPERATION_CAS = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_CAS{{endif}} - {{if 'CU_ATOMIC_OPERATION_FLOAT_ADD' in found_values}} - CU_ATOMIC_OPERATION_FLOAT_ADD 
= cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_ADD{{endif}} - {{if 'CU_ATOMIC_OPERATION_FLOAT_MIN' in found_values}} - CU_ATOMIC_OPERATION_FLOAT_MIN = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_MIN{{endif}} - {{if 'CU_ATOMIC_OPERATION_FLOAT_MAX' in found_values}} - CU_ATOMIC_OPERATION_FLOAT_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_MAX{{endif}} - {{if 'CU_ATOMIC_OPERATION_MAX' in found_values}} - CU_ATOMIC_OPERATION_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_MAX{{endif}} - -_dict_CUatomicOperation = dict(((int(v), v) for k, v in CUatomicOperation.__members__.items())) -{{endif}} -{{if 'CUatomicOperationCapability_enum' in found_types}} - -class CUatomicOperationCapability(IntEnum): - """ - CUDA-valid Atomic Operation capabilities - """ - {{if 'CU_ATOMIC_CAPABILITY_SIGNED' in found_values}} - CU_ATOMIC_CAPABILITY_SIGNED = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SIGNED{{endif}} - {{if 'CU_ATOMIC_CAPABILITY_UNSIGNED' in found_values}} - CU_ATOMIC_CAPABILITY_UNSIGNED = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_UNSIGNED{{endif}} - {{if 'CU_ATOMIC_CAPABILITY_REDUCTION' in found_values}} - CU_ATOMIC_CAPABILITY_REDUCTION = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_REDUCTION{{endif}} - {{if 'CU_ATOMIC_CAPABILITY_SCALAR_32' in found_values}} - CU_ATOMIC_CAPABILITY_SCALAR_32 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_32{{endif}} - {{if 'CU_ATOMIC_CAPABILITY_SCALAR_64' in found_values}} - CU_ATOMIC_CAPABILITY_SCALAR_64 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_64{{endif}} - {{if 'CU_ATOMIC_CAPABILITY_SCALAR_128' in found_values}} - CU_ATOMIC_CAPABILITY_SCALAR_128 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_128{{endif}} - {{if 'CU_ATOMIC_CAPABILITY_VECTOR_32x4' in found_values}} - CU_ATOMIC_CAPABILITY_VECTOR_32x4 = 
cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_VECTOR_32x4{{endif}} - -_dict_CUatomicOperationCapability = dict(((int(v), v) for k, v in CUatomicOperationCapability.__members__.items())) -{{endif}} {{if 'CUresourceViewFormat_enum' in found_types}} class CUresourceViewFormat(IntEnum): @@ -5354,6 +5473,8 @@ class CUCoredumpGenerationFlags(IntEnum): CU_COREDUMP_SKIP_CONSTBANK_MEMORY = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_SKIP_CONSTBANK_MEMORY{{endif}} {{if 'CU_COREDUMP_LIGHTWEIGHT_FLAGS' in found_values}} CU_COREDUMP_LIGHTWEIGHT_FLAGS = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_LIGHTWEIGHT_FLAGS{{endif}} + {{if 'CU_COREDUMP_GZIP_COMPRESS' in found_values}} + CU_COREDUMP_GZIP_COMPRESS = cydriver.CUCoredumpGenerationFlags.CU_COREDUMP_GZIP_COMPRESS{{endif}} _dict_CUCoredumpGenerationFlags = dict(((int(v), v) for k, v in CUCoredumpGenerationFlags.__members__.items())) {{endif}} @@ -5370,18 +5491,31 @@ class CUgreenCtxCreate_flags(IntEnum): _dict_CUgreenCtxCreate_flags = dict(((int(v), v) for k, v in CUgreenCtxCreate_flags.__members__.items())) {{endif}} -{{if 'CUdevSmResourceSplit_flags' in found_types}} +{{if 'CUdevSmResourceGroup_flags' in found_types}} + +class CUdevSmResourceGroup_flags(IntEnum): + """ + + """ + {{if 'CU_DEV_SM_RESOURCE_GROUP_DEFAULT' in found_values}} + CU_DEV_SM_RESOURCE_GROUP_DEFAULT = cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_DEFAULT{{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_BACKFILL' in found_values}} + CU_DEV_SM_RESOURCE_GROUP_BACKFILL = cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_BACKFILL{{endif}} + +_dict_CUdevSmResourceGroup_flags = dict(((int(v), v) for k, v in CUdevSmResourceGroup_flags.__members__.items())) +{{endif}} +{{if 'CUdevSmResourceSplitByCount_flags' in found_types}} -class CUdevSmResourceSplit_flags(IntEnum): +class CUdevSmResourceSplitByCount_flags(IntEnum): """ """ {{if 'CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING' in found_values}} - 
CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = cydriver.CUdevSmResourceSplit_flags.CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING{{endif}} + CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = cydriver.CUdevSmResourceSplitByCount_flags.CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING{{endif}} {{if 'CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE' in found_values}} - CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = cydriver.CUdevSmResourceSplit_flags.CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE{{endif}} + CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = cydriver.CUdevSmResourceSplitByCount_flags.CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE{{endif}} -_dict_CUdevSmResourceSplit_flags = dict(((int(v), v) for k, v in CUdevSmResourceSplit_flags.__members__.items())) +_dict_CUdevSmResourceSplitByCount_flags = dict(((int(v), v) for k, v in CUdevSmResourceSplitByCount_flags.__members__.items())) {{endif}} {{if 'CUdevResourceType' in found_types}} @@ -5395,9 +5529,36 @@ class CUdevResourceType(IntEnum): #: Streaming multiprocessors related information CU_DEV_RESOURCE_TYPE_SM = cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM{{endif}} + {{if 'CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG' in found_values}} + + #: Workqueue configuration related information + CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG = cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG{{endif}} + {{if 'CU_DEV_RESOURCE_TYPE_WORKQUEUE' in found_values}} + + #: Pre-existing workqueue related information + CU_DEV_RESOURCE_TYPE_WORKQUEUE = cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE{{endif}} _dict_CUdevResourceType = dict(((int(v), v) for k, v in CUdevResourceType.__members__.items())) {{endif}} +{{if 'CUdevWorkqueueConfigScope' in found_types}} + +class CUdevWorkqueueConfigScope(IntEnum): + """ + Sharing scope for workqueues + """ + {{if 'CU_WORKQUEUE_SCOPE_DEVICE_CTX' in found_values}} + + #: Use all shared workqueue resources across all contexts. 
Default + #: driver behaviour. + CU_WORKQUEUE_SCOPE_DEVICE_CTX = cydriver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_DEVICE_CTX{{endif}} + {{if 'CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED' in found_values}} + + #: When possible, use non-overlapping workqueue resources with other + #: balanced green contexts. + CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED = cydriver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED{{endif}} + +_dict_CUdevWorkqueueConfigScope = dict(((int(v), v) for k, v in CUdevWorkqueueConfigScope.__members__.items())) +{{endif}} {{if 'CUlogLevel_enum' in found_types}} class CUlogLevel(IntEnum): @@ -8642,6 +8803,205 @@ cdef class CUstreamMemOpMemoryBarrierParams_st: self._pvt_ptr[0].memoryBarrier.flags = flags {{endif}} {{endif}} +{{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + +cdef class CUstreamMemOpAtomicReductionParams_st: + """ + Attributes + ---------- + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.operation' in found_struct}} + operation : CUstreamBatchMemOpType + + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.flags' in found_struct}} + flags : unsigned int + Must be 0 + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.reductionOp' in found_struct}} + reductionOp : CUstreamAtomicReductionOpType + See CUstreamAtomicReductionOpType + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.dataType' in found_struct}} + dataType : CUstreamAtomicReductionDataType + See CUstreamAtomicReductionDataType + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}} + address : CUdeviceptr + The address the atomic operation will be operated on + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.value' in found_struct}} + value : cuuint64_t + The operand value the atomic operation will operate with + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}} + alias : CUdeviceptr + For driver 
internal use. Initial value is unimportant. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr): + self._pvt_ptr = _ptr + + def __init__(self, void_ptr _ptr): + pass + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}} + self._address = CUdeviceptr(_ptr=&self._pvt_ptr[0].atomicReduction.address) + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.value' in found_struct}} + self._value = cuuint64_t(_ptr=&self._pvt_ptr[0].atomicReduction.value) + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}} + self._alias = CUdeviceptr(_ptr=&self._pvt_ptr[0].atomicReduction.alias) + {{endif}} + def __dealloc__(self): + pass + def getPtr(self): + return &self._pvt_ptr[0].atomicReduction + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.operation' in found_struct}} + try: + str_list += ['operation : ' + str(self.operation)] + except ValueError: + str_list += ['operation : '] + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.flags' in found_struct}} + try: + str_list += ['flags : ' + str(self.flags)] + except ValueError: + str_list += ['flags : '] + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.reductionOp' in found_struct}} + try: + str_list += ['reductionOp : ' + str(self.reductionOp)] + except ValueError: + str_list += ['reductionOp : '] + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.dataType' in found_struct}} + try: + str_list += ['dataType : ' + str(self.dataType)] + except ValueError: + str_list += ['dataType : '] + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}} + try: + str_list += ['address : ' + str(self.address)] + except ValueError: + str_list += ['address : '] + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.value' in 
found_struct}} + try: + str_list += ['value : ' + str(self.value)] + except ValueError: + str_list += ['value : '] + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}} + try: + str_list += ['alias : ' + str(self.alias)] + except ValueError: + str_list += ['alias : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.operation' in found_struct}} + @property + def operation(self): + if self._pvt_ptr[0].atomicReduction.operation not in _dict_CUstreamBatchMemOpType: + return None + return _dict_CUstreamBatchMemOpType[self._pvt_ptr[0].atomicReduction.operation] + @operation.setter + def operation(self, operation not None : CUstreamBatchMemOpType): + self._pvt_ptr[0].atomicReduction.operation = operation.value + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.flags' in found_struct}} + @property + def flags(self): + return self._pvt_ptr[0].atomicReduction.flags + @flags.setter + def flags(self, unsigned int flags): + self._pvt_ptr[0].atomicReduction.flags = flags + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.reductionOp' in found_struct}} + @property + def reductionOp(self): + if self._pvt_ptr[0].atomicReduction.reductionOp not in _dict_CUstreamAtomicReductionOpType: + return None + return _dict_CUstreamAtomicReductionOpType[self._pvt_ptr[0].atomicReduction.reductionOp] + @reductionOp.setter + def reductionOp(self, reductionOp not None : CUstreamAtomicReductionOpType): + self._pvt_ptr[0].atomicReduction.reductionOp = reductionOp.value + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.dataType' in found_struct}} + @property + def dataType(self): + if self._pvt_ptr[0].atomicReduction.dataType not in _dict_CUstreamAtomicReductionDataType: + return None + return _dict_CUstreamAtomicReductionDataType[self._pvt_ptr[0].atomicReduction.dataType] + @dataType.setter + def dataType(self, dataType not None : 
CUstreamAtomicReductionDataType): + self._pvt_ptr[0].atomicReduction.dataType = dataType.value + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}} + @property + def address(self): + return self._address + @address.setter + def address(self, address): + cdef cydriver.CUdeviceptr cyaddress + if address is None: + cyaddress = 0 + elif isinstance(address, (CUdeviceptr)): + paddress = int(address) + cyaddress = paddress + else: + paddress = int(CUdeviceptr(address)) + cyaddress = paddress + self._address._pvt_ptr[0] = cyaddress + + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.value' in found_struct}} + @property + def value(self): + return self._value + @value.setter + def value(self, value): + cdef cydriver.cuuint64_t cyvalue + if value is None: + cyvalue = 0 + elif isinstance(value, (cuuint64_t)): + pvalue = int(value) + cyvalue = pvalue + else: + pvalue = int(cuuint64_t(value)) + cyvalue = pvalue + self._value._pvt_ptr[0] = cyvalue + + {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}} + @property + def alias(self): + return self._alias + @alias.setter + def alias(self, alias): + cdef cydriver.CUdeviceptr cyalias + if alias is None: + cyalias = 0 + elif isinstance(alias, (CUdeviceptr)): + palias = int(alias) + cyalias = palias + else: + palias = int(CUdeviceptr(alias)) + cyalias = palias + self._alias._pvt_ptr[0] = cyalias + + {{endif}} +{{endif}} {{if 'CUstreamBatchMemOpParams_union' in found_struct}} cdef class CUstreamBatchMemOpParams_union: @@ -8672,6 +9032,10 @@ cdef class CUstreamBatchMemOpParams_union: {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st Params for CU_STREAM_MEM_OP_BARRIER operations. 
+ {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + atomicReduction : CUstreamMemOpAtomicReductionParams_st + {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : list[cuuint64_t] @@ -8702,6 +9066,9 @@ cdef class CUstreamBatchMemOpParams_union: {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} self._memoryBarrier = CUstreamMemOpMemoryBarrierParams_st(_ptr=self._pvt_ptr) {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + self._atomicReduction = CUstreamMemOpAtomicReductionParams_st(_ptr=self._pvt_ptr) + {{endif}} def __dealloc__(self): pass def getPtr(self): @@ -8739,6 +9106,12 @@ cdef class CUstreamBatchMemOpParams_union: except ValueError: str_list += ['memoryBarrier : '] {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + try: + str_list += ['atomicReduction :\n' + '\n'.join([' ' + line for line in str(self.atomicReduction).splitlines()])] + except ValueError: + str_list += ['atomicReduction : '] + {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} try: str_list += ['pad : ' + str(self.pad)] @@ -8790,6 +9163,14 @@ cdef class CUstreamBatchMemOpParams_union: def memoryBarrier(self, memoryBarrier not None : CUstreamMemOpMemoryBarrierParams_st): string.memcpy(&self._pvt_ptr[0].memoryBarrier, memoryBarrier.getPtr(), sizeof(self._pvt_ptr[0].memoryBarrier)) {{endif}} + {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}} + @property + def atomicReduction(self): + return self._atomicReduction + @atomicReduction.setter + def atomicReduction(self, atomicReduction not None : CUstreamMemOpAtomicReductionParams_st): + string.memcpy(&self._pvt_ptr[0].atomicReduction, atomicReduction.getPtr(), sizeof(self._pvt_ptr[0].atomicReduction)) + {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} @property def pad(self): @@ -22444,20 +22825,23 @@ cdef class CUdevSmResource_st: {{if 
'CUdevSmResource_st.smCount' in found_struct}} smCount : unsigned int The amount of streaming multiprocessors available in this resource. - This is an output parameter only, do not write to this field. {{endif}} {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} minSmPartitionSize : unsigned int The minimum number of streaming multiprocessors required to - partition this resource. This is an output parameter only, do not - write to this field. + partition this resource. {{endif}} {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} smCoscheduledAlignment : unsigned int The number of streaming multiprocessors in this resource that are guaranteed to be co-scheduled on the same GPU processing cluster. - smCount is a multiple of this value. This is an output parameter - only, do not write to this field. + smCount will be a multiple of this value, unless the backfill flag + is set. + {{endif}} + {{if 'CUdevSmResource_st.flags' in found_struct}} + flags : unsigned int + The flags set on this SM resource. For possible values see + ::CUdevSmResourceGroup_flags. 
{{endif}} Methods @@ -22497,6 +22881,12 @@ cdef class CUdevSmResource_st: except ValueError: str_list += ['smCoscheduledAlignment : '] {{endif}} + {{if 'CUdevSmResource_st.flags' in found_struct}} + try: + str_list += ['flags : ' + str(self.flags)] + except ValueError: + str_list += ['flags : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -22524,6 +22914,287 @@ cdef class CUdevSmResource_st: def smCoscheduledAlignment(self, unsigned int smCoscheduledAlignment): self._pvt_ptr[0].smCoscheduledAlignment = smCoscheduledAlignment {{endif}} + {{if 'CUdevSmResource_st.flags' in found_struct}} + @property + def flags(self): + return self._pvt_ptr[0].flags + @flags.setter + def flags(self, unsigned int flags): + self._pvt_ptr[0].flags = flags + {{endif}} +{{endif}} +{{if 'CUdevWorkqueueConfigResource_st' in found_struct}} + +cdef class CUdevWorkqueueConfigResource_st: + """ + Attributes + ---------- + {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}} + device : CUdevice + The device on which the workqueue resources are available + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}} + wqConcurrencyLimit : unsigned int + The expected maximum number of concurrent stream-ordered workloads + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}} + sharingScope : CUdevWorkqueueConfigScope + The sharing scope for the workqueue resources + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}} + self._device = CUdevice(_ptr=&self._pvt_ptr[0].device) + {{endif}} + def __dealloc__(self): + pass + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 
'CUdevWorkqueueConfigResource_st.device' in found_struct}} + try: + str_list += ['device : ' + str(self.device)] + except ValueError: + str_list += ['device : '] + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}} + try: + str_list += ['wqConcurrencyLimit : ' + str(self.wqConcurrencyLimit)] + except ValueError: + str_list += ['wqConcurrencyLimit : '] + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}} + try: + str_list += ['sharingScope : ' + str(self.sharingScope)] + except ValueError: + str_list += ['sharingScope : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}} + @property + def device(self): + return self._device + @device.setter + def device(self, device): + cdef cydriver.CUdevice cydevice + if device is None: + cydevice = 0 + elif isinstance(device, (CUdevice)): + pdevice = int(device) + cydevice = pdevice + else: + pdevice = int(CUdevice(device)) + cydevice = pdevice + self._device._pvt_ptr[0] = cydevice + + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}} + @property + def wqConcurrencyLimit(self): + return self._pvt_ptr[0].wqConcurrencyLimit + @wqConcurrencyLimit.setter + def wqConcurrencyLimit(self, unsigned int wqConcurrencyLimit): + self._pvt_ptr[0].wqConcurrencyLimit = wqConcurrencyLimit + {{endif}} + {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}} + @property + def sharingScope(self): + if self._pvt_ptr[0].sharingScope not in _dict_CUdevWorkqueueConfigScope: + return None + return _dict_CUdevWorkqueueConfigScope[self._pvt_ptr[0].sharingScope] + @sharingScope.setter + def sharingScope(self, sharingScope not None : CUdevWorkqueueConfigScope): + self._pvt_ptr[0].sharingScope = sharingScope.value + {{endif}} +{{endif}} +{{if 'CUdevWorkqueueResource_st' in found_struct}} + +cdef class CUdevWorkqueueResource_st: + """ + Attributes + ---------- + 
{{if 'CUdevWorkqueueResource_st.reserved' in found_struct}} + reserved : bytes + Reserved for future use + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + def __dealloc__(self): + pass + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'CUdevWorkqueueResource_st.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'CUdevWorkqueueResource_st.reserved' in found_struct}} + @property + def reserved(self): + return PyBytes_FromStringAndSize(self._pvt_ptr[0].reserved, 40) + @reserved.setter + def reserved(self, reserved): + if len(reserved) != 40: + raise ValueError("reserved length must be 40, is " + str(len(reserved))) + for i, b in enumerate(reserved): + self._pvt_ptr[0].reserved[i] = b + {{endif}} +{{endif}} +{{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st' in found_struct}} + +cdef class CU_DEV_SM_RESOURCE_GROUP_PARAMS_st: + """ + Attributes + ---------- + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.smCount' in found_struct}} + smCount : unsigned int + The amount of SMs available in this resource. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.coscheduledSmCount' in found_struct}} + coscheduledSmCount : unsigned int + The amount of co-scheduled SMs grouped together for locality + purposes. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.preferredCoscheduledSmCount' in found_struct}} + preferredCoscheduledSmCount : unsigned int + When possible, combine co-scheduled groups together into larger + groups of this size. 
+ {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}} + flags : unsigned int + Combination of `CUdevSmResourceGroup_flags` values to indicate how + this group is created. + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}} + reserved : list[unsigned int] + Reserved for future use - ensure this is zero initialized. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + def __dealloc__(self): + pass + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.smCount' in found_struct}} + try: + str_list += ['smCount : ' + str(self.smCount)] + except ValueError: + str_list += ['smCount : '] + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.coscheduledSmCount' in found_struct}} + try: + str_list += ['coscheduledSmCount : ' + str(self.coscheduledSmCount)] + except ValueError: + str_list += ['coscheduledSmCount : '] + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.preferredCoscheduledSmCount' in found_struct}} + try: + str_list += ['preferredCoscheduledSmCount : ' + str(self.preferredCoscheduledSmCount)] + except ValueError: + str_list += ['preferredCoscheduledSmCount : '] + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}} + try: + str_list += ['flags : ' + str(self.flags)] + except ValueError: + str_list += ['flags : '] + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.smCount' in found_struct}} + @property + def smCount(self): 
return self._pvt_ptr[0].smCount + @smCount.setter + def smCount(self, unsigned int smCount): + self._pvt_ptr[0].smCount = smCount + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.coscheduledSmCount' in found_struct}} + @property + def coscheduledSmCount(self): + return self._pvt_ptr[0].coscheduledSmCount + @coscheduledSmCount.setter + def coscheduledSmCount(self, unsigned int coscheduledSmCount): + self._pvt_ptr[0].coscheduledSmCount = coscheduledSmCount + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.preferredCoscheduledSmCount' in found_struct}} + @property + def preferredCoscheduledSmCount(self): + return self._pvt_ptr[0].preferredCoscheduledSmCount + @preferredCoscheduledSmCount.setter + def preferredCoscheduledSmCount(self, unsigned int preferredCoscheduledSmCount): + self._pvt_ptr[0].preferredCoscheduledSmCount = preferredCoscheduledSmCount + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}} + @property + def flags(self): + return self._pvt_ptr[0].flags + @flags.setter + def flags(self, unsigned int flags): + self._pvt_ptr[0].flags = flags + {{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} {{endif}} {{if 'CUdevResource_st' in found_struct}} @@ -22543,9 +23214,23 @@ cdef class CUdevResource_st: sm : CUdevSmResource Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`. {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + wqConfig : CUdevWorkqueueConfigResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG + `typename`. + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + wq : CUdevWorkqueueResource + Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE + `typename`. 
+ {{endif}} {{if 'CUdevResource_st._oversize' in found_struct}} _oversize : bytes + {{endif}} + {{if 'CUdevResource_st.nextResource' in found_struct}} + nextResource : CUdevResource_st + {{endif}} Methods @@ -22564,9 +23249,19 @@ cdef class CUdevResource_st: {{if 'CUdevResource_st.sm' in found_struct}} self._sm = CUdevSmResource(_ptr=&self._pvt_ptr[0].sm) {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + self._wqConfig = CUdevWorkqueueConfigResource(_ptr=&self._pvt_ptr[0].wqConfig) + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + self._wq = CUdevWorkqueueResource(_ptr=&self._pvt_ptr[0].wq) + {{endif}} def __dealloc__(self): if self._val_ptr is not NULL: free(self._val_ptr) + {{if 'CUdevResource_st.nextResource' in found_struct}} + if self._nextResource is not NULL: + free(self._nextResource) + {{endif}} def getPtr(self): return self._pvt_ptr def __repr__(self): @@ -22590,12 +23285,30 @@ cdef class CUdevResource_st: except ValueError: str_list += ['sm : '] {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + try: + str_list += ['wqConfig :\n' + '\n'.join([' ' + line for line in str(self.wqConfig).splitlines()])] + except ValueError: + str_list += ['wqConfig : '] + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + try: + str_list += ['wq :\n' + '\n'.join([' ' + line for line in str(self.wq).splitlines()])] + except ValueError: + str_list += ['wq : '] + {{endif}} {{if 'CUdevResource_st._oversize' in found_struct}} try: str_list += ['_oversize : ' + str(self._oversize)] except ValueError: str_list += ['_oversize : '] {{endif}} + {{if 'CUdevResource_st.nextResource' in found_struct}} + try: + str_list += ['nextResource : ' + str(self.nextResource)] + except ValueError: + str_list += ['nextResource : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -22628,17 +23341,56 @@ cdef class CUdevResource_st: def sm(self, sm not None : CUdevSmResource): string.memcpy(&self._pvt_ptr[0].sm, sm.getPtr(), 
sizeof(self._pvt_ptr[0].sm)) {{endif}} + {{if 'CUdevResource_st.wqConfig' in found_struct}} + @property + def wqConfig(self): + return self._wqConfig + @wqConfig.setter + def wqConfig(self, wqConfig not None : CUdevWorkqueueConfigResource): + string.memcpy(&self._pvt_ptr[0].wqConfig, wqConfig.getPtr(), sizeof(self._pvt_ptr[0].wqConfig)) + {{endif}} + {{if 'CUdevResource_st.wq' in found_struct}} + @property + def wq(self): + return self._wq + @wq.setter + def wq(self, wq not None : CUdevWorkqueueResource): + string.memcpy(&self._pvt_ptr[0].wq, wq.getPtr(), sizeof(self._pvt_ptr[0].wq)) + {{endif}} {{if 'CUdevResource_st._oversize' in found_struct}} @property def _oversize(self): - return PyBytes_FromStringAndSize(self._pvt_ptr[0]._oversize, 48) + return PyBytes_FromStringAndSize(self._pvt_ptr[0]._oversize, 40) @_oversize.setter def _oversize(self, _oversize): - if len(_oversize) != 48: - raise ValueError("_oversize length must be 48, is " + str(len(_oversize))) + if len(_oversize) != 40: + raise ValueError("_oversize length must be 40, is " + str(len(_oversize))) for i, b in enumerate(_oversize): self._pvt_ptr[0]._oversize[i] = b {{endif}} + {{if 'CUdevResource_st.nextResource' in found_struct}} + @property + def nextResource(self): + arrs = [self._pvt_ptr[0].nextResource + x*sizeof(cydriver.CUdevResource_st) for x in range(self._nextResource_length)] + return [CUdevResource_st(_ptr=arr) for arr in arrs] + @nextResource.setter + def nextResource(self, val): + if len(val) == 0: + free(self._nextResource) + self._nextResource_length = 0 + self._pvt_ptr[0].nextResource = NULL + else: + if self._nextResource_length != len(val): + free(self._nextResource) + self._nextResource = calloc(len(val), sizeof(cydriver.CUdevResource_st)) + if self._nextResource is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(val)) + 'x' + str(sizeof(cydriver.CUdevResource_st))) + self._nextResource_length = len(val) + self._pvt_ptr[0].nextResource = 
self._nextResource + for idx in range(len(val)): + string.memcpy(&self._nextResource[idx], (val[idx])._pvt_ptr, sizeof(cydriver.CUdevResource_st)) + + {{endif}} {{endif}} {{if True}} @@ -25243,7 +25995,7 @@ def cuCtxSynchronize(): Returns ------- CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` See Also -------- @@ -25280,7 +26032,7 @@ def cuCtxSynchronize_v2(ctx): Returns ------- CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` See Also -------- @@ -26038,7 +26790,7 @@ def cuModuleLoad(char* fname): allocated, :py:obj:`~.cuModuleLoad()` fails. The file should be a `cubin` file as output by nvcc, or a `PTX` file either as output by nvcc or handwritten, or a `fatbin` file as output by nvcc from - toolchain 4.0 or later. + toolchain 4.0 or later, or a `Tile` IR file. Parameters ---------- @@ -26073,7 +26825,7 @@ def cuModuleLoadData(image): Takes a pointer `image` and loads the corresponding module `module` into the current context. The `image` may be a `cubin` or `fatbin` as output by nvcc, or a NULL-terminated `PTX`, either as output by nvcc or - hand-written. + hand-written, or `Tile` IR data. 
Parameters ---------- @@ -26110,7 +26862,7 @@ def cuModuleLoadDataEx(image, unsigned int numOptions, options : Optional[tuple[ Takes a pointer `image` and loads the corresponding module `module` into the current context. The `image` may be a `cubin` or `fatbin` as output by nvcc, or a NULL-terminated `PTX`, either as output by nvcc or - hand-written. + hand-written, or `Tile` IR data. Parameters ---------- @@ -26869,8 +27621,9 @@ def cuLibraryLoadData(code, jitOptions : Optional[tuple[CUjit_option] | list[CUj under the "CUDA environment variables" section. The `code` may be a `cubin` or `fatbin` as output by nvcc, or a NULL- - terminated `PTX`, either as output by nvcc or hand-written. A fatbin - should also contain relocatable code when doing separate compilation. + terminated `PTX`, either as output by nvcc or hand-written, or `Tile` + IR data. A fatbin should also contain relocatable code when doing + separate compilation. Options are passed as an array via `jitOptions` and any corresponding parameters are passed in `jitOptionsValues`. The number of total JIT @@ -26968,8 +27721,8 @@ def cuLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[CUjit_opti The file should be a `cubin` file as output by nvcc, or a `PTX` file either as output by nvcc or handwritten, or a `fatbin` file as output - by nvcc. A fatbin should also contain relocatable code when doing - separate compilation. + by nvcc or hand-written, or `Tile` IR file. A fatbin should also + contain relocatable code when doing separate compilation. Options are passed as an array via `jitOptions` and any corresponding parameters are passed in `jitOptionsValues`. The number of total @@ -28117,9 +28870,9 @@ def cuMemGetAddressRange(dptr): """ Get information on memory allocations. Returns the base address in `*pbase` and size in `*psize` of the - allocation by :py:obj:`~.cuMemAlloc()` or :py:obj:`~.cuMemAllocPitch()` - that contains the input pointer `dptr`. 
Both parameters `pbase` and - `psize` are optional. If one of them is NULL, it is ignored. + allocation that contains the input pointer `dptr`. Both parameters + `pbase` and `psize` are optional. If one of them is NULL, it is + ignored. Parameters ---------- @@ -34111,6 +34864,21 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]): /proc/devices users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c 0` + To create a managed memory pool, applications must set + :py:obj:`~.CUmemPoolProps`::CUmemAllocationType to + CU_MEM_ALLOCATION_TYPE_MANAGED. + :py:obj:`~.CUmemPoolProps`::CUmemAllocationHandleType must also be set + to CU_MEM_HANDLE_TYPE_NONE since IPC is not supported. For managed + memory pools, :py:obj:`~.CUmemPoolProps`::CUmemLocation will be treated + as the preferred location for all allocations created from the pool. An + application can also set CU_MEM_LOCATION_TYPE_NONE to indicate no + preferred location. :py:obj:`~.CUmemPoolProps.maxSize` must be set to + zero for managed memory pools. :py:obj:`~.CUmemPoolProps.usage` should + be zero as decompress for managed memory is not supported. For managed + memory pools, all devices on the system must have non-zero + :py:obj:`~.concurrentManagedAccess`. If not, this call returns + CUDA_ERROR_NOT_SUPPORTED + Parameters ---------- poolProps : :py:obj:`~.CUmemPoolProps` @@ -34623,10 +35391,10 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]): the multicast object via :py:obj:`~.cuMulticastAddDevice`. All participating devices must be added to the multicast object before memory can be bound to it. Memory is bound to the multicast object via - either :py:obj:`~.cuMulticastBindMem` or - :py:obj:`~.cuMulticastBindAddr`, and can be unbound via - :py:obj:`~.cuMulticastUnbind`. 
The total amount of memory that can be - bound per device is specified by + :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindMem_v2`, + :py:obj:`~.cuMulticastBindAddr`, or :py:obj:`~.cuMulticastBindAddr_v2`. + and can be unbound via :py:obj:`~.cuMulticastUnbind`. The total amount + of memory that can be bound per device is specified by :py:obj:`~.py`:obj:`~.CUmulticastObjectProp.size`. This size must be a multiple of the value returned by :py:obj:`~.cuMulticastGetGranularity` with the flag :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best @@ -34662,6 +35430,8 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]): :py:obj:`~.cuMulticastAddDevice`, :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindAddr`, :py:obj:`~.cuMulticastUnbind` :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemRelease`, :py:obj:`~.cuMemExportToShareableHandle`, :py:obj:`~.cuMemImportFromShareableHandle` + + :py:obj:`~.cuMulticastBindAddr_v2`, :py:obj:`~.cuMulticastBindMem_v2` """ cdef CUmemGenericAllocationHandle mcHandle = CUmemGenericAllocationHandle() cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop is not None else NULL @@ -34685,10 +35455,11 @@ def cuMulticastAddDevice(mcHandle, dev): multicast object is permanent during the life time of the multicast object. All devices must be added to the multicast team before any memory can be bound to any device in the team. Any calls to - :py:obj:`~.cuMulticastBindMem` or :py:obj:`~.cuMulticastBindAddr` will - block until all devices have been added. Similarly all devices must be - added to the multicast team before a virtual address range can be - mapped to the multicast object. A call to :py:obj:`~.cuMemMap` will + :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindMem_v2`, + :py:obj:`~.cuMulticastBindAddr`, or :py:obj:`~.cuMulticastBindAddr_v2` + will block until all devices have been added. 
Similarly all devices + must be added to the multicast team before a virtual address range can + be mapped to the multicast object. A call to :py:obj:`~.cuMemMap` will block until all devices have been added. Parameters @@ -34786,6 +35557,8 @@ def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, s See Also -------- :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastAddDevice`, :py:obj:`~.cuMemCreate` + + :py:obj:`~.cuMulticastBindMem_v2` """ cdef cydriver.CUmemGenericAllocationHandle cymemHandle if memHandle is None: @@ -34808,6 +35581,107 @@ def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, s return (_dict_CUresult[err],) {{endif}} +{{if 'cuMulticastBindMem_v2' in found_functions}} + +@cython.embedsignature(True) +def cuMulticastBindMem_v2(mcHandle, dev, size_t mcOffset, memHandle, size_t memOffset, size_t size, unsigned long long flags): + """ Bind a memory allocation represented by a handle to a multicast object. + + Binds a memory allocation specified by `memHandle` and created via + :py:obj:`~.cuMemCreate` to a multicast object represented by `mcHandle` + and created via :py:obj:`~.cuMulticastCreate`. The binding will be + applicable for the device `dev`. The intended `size` of the bind, the + offset in the multicast range `mcOffset` as well as the offset in the + memory `memOffset` must be a multiple of the value returned by + :py:obj:`~.cuMulticastGetGranularity` with the flag + :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance + however, `size`, `mcOffset` and `memOffset` should be aligned to the + granularity of the memory allocation(see + :py:obj:`~.cuMemGetAllocationGranularity`) or to the value returned by + :py:obj:`~.cuMulticastGetGranularity` with the flag + :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`. + + The `size` + `memOffset` cannot be larger than the size of the + allocated memory. 
Similarly the `size` + `mcOffset` cannot be larger + than the size of the multicast object. The memory allocation must have + beeen created on one of the devices that was added to the multicast + team via :py:obj:`~.cuMulticastAddDevice`. For device memory, i.e., + type :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, the memory allocation + must have been created on the device specified by `dev`. For host NUMA + memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the + memory allocation must have been created on the CPU NUMA node closest + to `dev`. That is, the value returned when querying + :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU + NUMA node where the memory was allocated. In both cases, the device + named by `dev` must have been added to the multicast team via + :py:obj:`~.cuMulticastAddDevice`. Externally shareable as well as + imported multicast objects can be bound only to externally shareable + memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if + there are insufficient resources required to perform the bind. This + call may also return CUDA_ERROR_SYSTEM_NOT_READY if the necessary + system software is not initialized or running. + + This call may return CUDA_ERROR_ILLEGAL_STATE if the system + configuration is in an illegal state. In such cases, to continue using + multicast, verify that the system configuration is in a valid state and + all required driver daemons are running properly. + + Parameters + ---------- + mcHandle : :py:obj:`~.CUmemGenericAllocationHandle` + Handle representing a multicast object. + dev : :py:obj:`~.CUdevice` + The device that for which the multicast memory binding will be + applicable. + mcOffset : size_t + Offset into the multicast object for attachment. + memHandle : :py:obj:`~.CUmemGenericAllocationHandle` + Handle representing a memory allocation. + memOffset : size_t + Offset into the memory for attachment. 
+ size : size_t + Size of the memory that will be bound to the multicast object. + flags : unsigned long long + Flags for future use, must be zero for now. + + Returns + ------- + CUresult + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` + + See Also + -------- + :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastAddDevice`, :py:obj:`~.cuMemCreate` + """ + cdef cydriver.CUmemGenericAllocationHandle cymemHandle + if memHandle is None: + pmemHandle = 0 + elif isinstance(memHandle, (CUmemGenericAllocationHandle,)): + pmemHandle = int(memHandle) + else: + pmemHandle = int(CUmemGenericAllocationHandle(memHandle)) + cymemHandle = pmemHandle + cdef cydriver.CUdevice cydev + if dev is None: + pdev = 0 + elif isinstance(dev, (CUdevice,)): + pdev = int(dev) + else: + pdev = int(CUdevice(dev)) + cydev = pdev + cdef cydriver.CUmemGenericAllocationHandle cymcHandle + if mcHandle is None: + pmcHandle = 0 + elif isinstance(mcHandle, (CUmemGenericAllocationHandle,)): + pmcHandle = int(mcHandle) + else: + pmcHandle = int(CUmemGenericAllocationHandle(mcHandle)) + cymcHandle = pmcHandle + with nogil: + err = cydriver.cuMulticastBindMem_v2(cymcHandle, cydev, mcOffset, cymemHandle, memOffset, size, flags) + return (_dict_CUresult[err],) +{{endif}} + {{if 'cuMulticastBindAddr' in found_functions}} @cython.embedsignature(True) @@ -34862,6 +35736,8 @@ def cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned See Also -------- :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastAddDevice`, :py:obj:`~.cuMemCreate` + + :py:obj:`~.cuMulticastBindAddr_v2` """ cdef cydriver.CUdeviceptr cymemptr if memptr is None: @@ -34884,6 +35760,101 @@ def 
cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned return (_dict_CUresult[err],) {{endif}} +{{if 'cuMulticastBindAddr_v2' in found_functions}} + +@cython.embedsignature(True) +def cuMulticastBindAddr_v2(mcHandle, dev, size_t mcOffset, memptr, size_t size, unsigned long long flags): + """ Bind a memory allocation represented by a virtual address to a multicast object. + + Binds a memory allocation specified by its mapped address `memptr` to a + multicast object represented by `mcHandle`. The binding will be + applicable for the device `dev`. The memory must have been allocated + via :py:obj:`~.cuMemCreate` or :py:obj:`~.cudaMallocAsync`. The + intended `size` of the bind, the offset in the multicast range + `mcOffset` and `memptr` must be a multiple of the value returned by + :py:obj:`~.cuMulticastGetGranularity` with the flag + :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance + however, `size`, `mcOffset` and `memptr` should be aligned to the value + returned by :py:obj:`~.cuMulticastGetGranularity` with the flag + :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`. + + The `size` cannot be larger than the size of the allocated memory. + Similarly the `size` + `mcOffset` cannot be larger than the total size + of the multicast object. For device memory, i.e., type + :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, the memory allocation must + have been created on the device specified by `dev`. For host NUMA + memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the + memory allocation must have been created on the CPU NUMA node closest + to `dev`. That is, the value returned when querying + :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU + NUMA node where the memory was allocated. In both cases, the device + named by `dev` must have been added to the multicast team via + :py:obj:`~.cuMulticastAddDevice`. 
Externally shareable as well as + imported multicast objects can be bound only to externally shareable + memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if + there are insufficient resources required to perform the bind. This + call may also return CUDA_ERROR_SYSTEM_NOT_READY if the necessary + system software is not initialized or running. + + This call may return CUDA_ERROR_ILLEGAL_STATE if the system + configuration is in an illegal state. In such cases, to continue using + multicast, verify that the system configuration is in a valid state and + all required driver daemons are running properly. + + Parameters + ---------- + mcHandle : :py:obj:`~.CUmemGenericAllocationHandle` + Handle representing a multicast object. + dev : :py:obj:`~.CUdevice` + The device that for which the multicast memory binding will be + applicable. + mcOffset : size_t + Offset into multicast va range for attachment. + memptr : :py:obj:`~.CUdeviceptr` + Virtual address of the memory allocation. + size : size_t + Size of memory that will be bound to the multicast object. + flags : unsigned long long + Flags for future use, must be zero now. 
+ + Returns + ------- + CUresult + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` + + See Also + -------- + :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastAddDevice`, :py:obj:`~.cuMemCreate` + """ + cdef cydriver.CUdeviceptr cymemptr + if memptr is None: + pmemptr = 0 + elif isinstance(memptr, (CUdeviceptr,)): + pmemptr = int(memptr) + else: + pmemptr = int(CUdeviceptr(memptr)) + cymemptr = pmemptr + cdef cydriver.CUdevice cydev + if dev is None: + pdev = 0 + elif isinstance(dev, (CUdevice,)): + pdev = int(dev) + else: + pdev = int(CUdevice(dev)) + cydev = pdev + cdef cydriver.CUmemGenericAllocationHandle cymcHandle + if mcHandle is None: + pmcHandle = 0 + elif isinstance(mcHandle, (CUmemGenericAllocationHandle,)): + pmcHandle = int(mcHandle) + else: + pmcHandle = int(CUmemGenericAllocationHandle(mcHandle)) + cymcHandle = pmcHandle + with nogil: + err = cydriver.cuMulticastBindAddr_v2(cymcHandle, cydev, mcOffset, cymemptr, size, flags) + return (_dict_CUresult[err],) +{{endif}} + {{if 'cuMulticastUnbind' in found_functions}} @cython.embedsignature(True) @@ -34918,6 +35889,8 @@ def cuMulticastUnbind(mcHandle, dev, size_t mcOffset, size_t size): -------- :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindAddr` + :py:obj:`~.cuMulticastBindMem_v2`, :py:obj:`~.cuMulticastBindAddr_v2` + Notes ----- Warning: The `mcOffset` and the `size` must match the corresponding values specified during the bind call. Any other values may result in undefined behavior. 
@@ -34971,6 +35944,8 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not See Also -------- :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindAddr`, :py:obj:`~.cuMulticastUnbind` + + :py:obj:`~.cuMulticastBindMem_v2`, :py:obj:`~.cuMulticastBindAddr_v2` """ cdef size_t granularity = 0 cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop is not None else NULL @@ -35206,7 +36181,8 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation, specifies the destination location. `count` specifies the number of bytes to copy. `hStream` is the stream in which the operation is enqueued. The memory range must refer to managed memory allocated via - :py:obj:`~.cuMemAllocManaged` or declared via managed variables. + :py:obj:`~.cuMemAllocManaged`, via :py:obj:`~.cuMemAllocFromPool` from + a managed memory pool or declared via managed variables. Specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` for :py:obj:`~.CUmemLocation.type` will prefetch memory to GPU specified by @@ -51000,11 +51976,16 @@ def cuGreenCtxDestroy(hCtx): descriptor) are released as well. The API does not destroy streams created via :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamCreate`, or :py:obj:`~.cuStreamCreateWithPriority`. - Once the green context is destroyed, any subsequent API calls involving - these streams (including :py:obj:`~.cuStreamDestroy`) will return - :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`. Users must explicitly - destroy all such streams before invoking :py:obj:`~.cuGreenCtxDestroy`. - Failure to do so will result in a memory leak. + Users are expected to destroy these streams explicitly using + :py:obj:`~.cuStreamDestroy` to avoid resource leaks. Once the green + context is destroyed, any subsequent API calls involving these streams + will return :py:obj:`~.CUDA_ERROR_STREAM_DETACHED` with the exception + of the following APIs: + + - :py:obj:`~.cuStreamDestroy`. 
+ + Additionally, the API will invalidate all active captures on these + streams. Parameters ---------- @@ -51220,12 +52201,12 @@ def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType): {{if 'cuDevSmResourceSplitByCount' in found_functions}} @cython.embedsignature(True) -def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevResource], unsigned int useFlags, unsigned int minCount): +def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevResource], unsigned int flags, unsigned int minCount): """ Splits `CU_DEV_RESOURCE_TYPE_SM` resources. Splits `CU_DEV_RESOURCE_TYPE_SM` resources into `nbGroups`, adhering to the minimum SM count specified in `minCount` and the usage flags in - `useFlags`. If `result` is NULL, the API simulates a split and provides + `flags`. If `result` is NULL, the API simulates a split and provides the amount of groups that would be created in `nbGroups`. Otherwise, `nbGroups` must point to the amount of elements in `result` and on return, the API will overwrite `nbGroups` with the amount actually @@ -51275,13 +52256,13 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe - A valid array of `result` pointers of size passed in `nbGroups`, with `input` of type `CU_DEV_RESOURCE_TYPE_SM`. Value of `minCount` must - be between 0 and the SM count specified in `input`. `remaining` may + be between 0 and the SM count specified in `input`. `remainder` may be NULL. - NULL passed in for `result`, with a valid integer pointer in `nbGroups` and `input` of type `CU_DEV_RESOURCE_TYPE_SM`. Value of `minCount` must be between 0 and the SM count specified in `input`. - `remaining` may be NULL. This queries the number of groups that would + `remainder` may be NULL. This queries the number of groups that would be created by the API. Note: The API is not supported on 32-bit platforms. 
@@ -51294,7 +52275,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe input : :py:obj:`~.CUdevResource` Input SM resource to be split. Must be a valid `CU_DEV_RESOURCE_TYPE_SM` resource. - useFlags : unsigned int + flags : unsigned int Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default behavior. @@ -51311,9 +52292,9 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe nbGroups : unsigned int This is a pointer, specifying the number of groups that would be or should be created as described below. - remaining : :py:obj:`~.CUdevResource` + remainder : :py:obj:`~.CUdevResource` If the input resource cannot be cleanly split among `nbGroups`, the - remaining is placed in here. Can be ommitted (NULL) if the user + remainder is placed in here. Can be ommitted (NULL) if the user does not need the remaining set. See Also @@ -51328,9 +52309,9 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe raise MemoryError('Failed to allocate length x size memory: ' + str(nbGroups) + 'x' + str(sizeof(cydriver.CUdevResource))) cdef unsigned int cynbGroups = nbGroups cdef cydriver.CUdevResource* cyinput__ptr = input_._pvt_ptr if input_ is not None else NULL - cdef CUdevResource remaining = CUdevResource() + cdef CUdevResource remainder = CUdevResource() with nogil: - err = cydriver.cuDevSmResourceSplitByCount(cyresult, &cynbGroups, cyinput__ptr, remaining._pvt_ptr, useFlags, minCount) + err = cydriver.cuDevSmResourceSplitByCount(cyresult, &cynbGroups, cyinput__ptr, remainder._pvt_ptr, flags, minCount) if CUresult(err) == CUresult(0): for idx in range(nbGroups): string.memcpy((pyresult[idx])._pvt_ptr, &cyresult[idx], sizeof(cydriver.CUdevResource)) @@ -51338,7 +52319,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe free(cyresult) if err != cydriver.CUDA_SUCCESS: return 
(_dict_CUresult[err], None, None, None) - return (_dict_CUresult[err], pyresult, cynbGroups, remaining) + return (_dict_CUresult[err], pyresult, cynbGroups, remainder) {{endif}} {{if 'cuDevResourceGenerateDesc' in found_functions}} @@ -51362,7 +52343,8 @@ def cuDevResourceGenerateDesc(resources : Optional[tuple[CUdevResource] | list[C CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned. If multiple resources are provided in `resources` and they are of type :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`, they must be outputs (whether - `result` or `remaining`) from the same split API instance, otherwise + `result` or `remaining`) from the same split API instance and have + the same smCoscheduledAlignment values, otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned. Note: The API is not supported on 32-bit platforms. @@ -54088,6 +55070,24 @@ def sizeof(objType): {{if 'CUdevSmResource' in found_types}} if objType == CUdevSmResource: return sizeof(cydriver.CUdevSmResource){{endif}} + {{if 'CUdevWorkqueueConfigResource_st' in found_struct}} + if objType == CUdevWorkqueueConfigResource_st: + return sizeof(cydriver.CUdevWorkqueueConfigResource_st){{endif}} + {{if 'CUdevWorkqueueConfigResource' in found_types}} + if objType == CUdevWorkqueueConfigResource: + return sizeof(cydriver.CUdevWorkqueueConfigResource){{endif}} + {{if 'CUdevWorkqueueResource_st' in found_struct}} + if objType == CUdevWorkqueueResource_st: + return sizeof(cydriver.CUdevWorkqueueResource_st){{endif}} + {{if 'CUdevWorkqueueResource' in found_types}} + if objType == CUdevWorkqueueResource: + return sizeof(cydriver.CUdevWorkqueueResource){{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st' in found_struct}} + if objType == CU_DEV_SM_RESOURCE_GROUP_PARAMS_st: + return sizeof(cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS_st){{endif}} + {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS' in found_types}} + if objType == CU_DEV_SM_RESOURCE_GROUP_PARAMS: + return sizeof(cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS){{endif}} 
{{if 'CUdevResource_st' in found_struct}} if objType == CUdevResource_st: return sizeof(cydriver.CUdevResource_st){{endif}} diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd index 067c3cf4c9..1c9e520aed 100644 --- a/cuda_bindings/cuda/bindings/nvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t, uint32_t diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx index bf85453459..1a0398bca8 100644 --- a/cuda_bindings/cuda/bindings/nvjitlink.pyx +++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. cimport cython # NOQA diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd.in b/cuda_bindings/cuda/bindings/nvrtc.pxd.in index e1f0309215..fbda11a161 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pxd.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. 
cimport cuda.bindings.cynvrtc as cynvrtc include "_lib/utils.pxd" diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index 5cb8dadf5c..c0c9022fb7 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. from typing import Any, Optional from enum import IntEnum import cython diff --git a/cuda_bindings/cuda/bindings/nvvm.pxd b/cuda_bindings/cuda/bindings/nvvm.pxd index ece8e75890..c7e4541003 100644 --- a/cuda_bindings/cuda/bindings/nvvm.pxd +++ b/cuda_bindings/cuda/bindings/nvvm.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/nvvm.pyx b/cuda_bindings/cuda/bindings/nvvm.pyx index 72df74c3a6..a7a9a61152 100644 --- a/cuda_bindings/cuda/bindings/nvvm.pyx +++ b/cuda_bindings/cuda/bindings/nvvm.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 13.0.2. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.1.0. Do not modify it directly. 
cimport cython # NOQA diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in index bb5e0906f7..889f4cbaf4 100644 --- a/cuda_bindings/cuda/bindings/runtime.pxd.in +++ b/cuda_bindings/cuda/bindings/runtime.pxd.in @@ -1,12 +1,46 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. cimport cuda.bindings.cyruntime as cyruntime include "_lib/utils.pxd" cimport cuda.bindings.driver as driver +{{if 'cudaDevResourceDesc_t' in found_types}} + +cdef class cudaDevResourceDesc_t: + """ + + An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via ::cudaDeviceResourceGenerateDesc + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + cdef cyruntime.cudaDevResourceDesc_t _pvt_val + cdef cyruntime.cudaDevResourceDesc_t* _pvt_ptr +{{endif}} + +{{if 'cudaExecutionContext_t' in found_types}} + +cdef class cudaExecutionContext_t: + """ + + An opaque handle to a CUDA execution context. It represents an execution context created via CUDA Runtime APIs such as cudaGreenCtxCreate. + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + cdef cyruntime.cudaExecutionContext_t _pvt_val + cdef cyruntime.cudaExecutionContext_t* _pvt_ptr +{{endif}} + {{if 'cudaArray_t' in found_types}} cdef class cudaArray_t: @@ -648,9 +682,14 @@ cdef class cudaMemcpyNodeParams: Must be zero {{endif}} {{if 'cudaMemcpyNodeParams.reserved' in found_struct}} - reserved : list[int] + reserved : int Must be zero {{endif}} + {{if 'cudaMemcpyNodeParams.ctx' in found_struct}} + ctx : cudaExecutionContext_t + Context in which to run the memcpy. 
If NULL will try to use the + current context. + {{endif}} {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}} copyParams : cudaMemcpy3DParms Parameters for the memory copy @@ -663,6 +702,9 @@ cdef class cudaMemcpyNodeParams: """ cdef cyruntime.cudaMemcpyNodeParams _pvt_val cdef cyruntime.cudaMemcpyNodeParams* _pvt_ptr + {{if 'cudaMemcpyNodeParams.ctx' in found_struct}} + cdef cudaExecutionContext_t _ctx + {{endif}} {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}} cdef cudaMemcpy3DParms _copyParams {{endif}} @@ -814,6 +856,11 @@ cdef class cudaMemsetParamsV2: height : size_t Number of rows {{endif}} + {{if 'cudaMemsetParamsV2.ctx' in found_struct}} + ctx : cudaExecutionContext_t + Context in which to run the memset. If NULL will try to use the + current context. + {{endif}} Methods ------- @@ -822,6 +869,9 @@ cdef class cudaMemsetParamsV2: """ cdef cyruntime.cudaMemsetParamsV2 _pvt_val cdef cyruntime.cudaMemsetParamsV2* _pvt_ptr + {{if 'cudaMemsetParamsV2.ctx' in found_struct}} + cdef cudaExecutionContext_t _ctx + {{endif}} {{endif}} {{if 'cudaAccessPolicyWindow' in found_struct}} @@ -2871,6 +2921,208 @@ cdef class cudaExternalSemaphoreWaitParams: cdef anon_struct15 _params {{endif}} {{endif}} +{{if 'cudaDevSmResource' in found_struct}} + +cdef class cudaDevSmResource: + """ + Data for SM-related resources All parameters in this structure are + OUTPUT only. Do not write to any of the fields in this structure. + + Attributes + ---------- + {{if 'cudaDevSmResource.smCount' in found_struct}} + smCount : unsigned int + The amount of streaming multiprocessors available in this resource. + {{endif}} + {{if 'cudaDevSmResource.minSmPartitionSize' in found_struct}} + minSmPartitionSize : unsigned int + The minimum number of streaming multiprocessors required to + partition this resource. 
+ {{endif}} + {{if 'cudaDevSmResource.smCoscheduledAlignment' in found_struct}} + smCoscheduledAlignment : unsigned int + The number of streaming multiprocessors in this resource that are + guaranteed to be co-scheduled on the same GPU processing cluster. + smCount will be a multiple of this value, unless the backfill flag + is set. + {{endif}} + {{if 'cudaDevSmResource.flags' in found_struct}} + flags : unsigned int + The flags set on this SM resource. For available flags see + ::cudaDevSmResourceGroup_flags. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cyruntime.cudaDevSmResource _pvt_val + cdef cyruntime.cudaDevSmResource* _pvt_ptr +{{endif}} +{{if 'cudaDevWorkqueueConfigResource' in found_struct}} + +cdef class cudaDevWorkqueueConfigResource: + """ + Data for workqueue configuration related resources + + Attributes + ---------- + {{if 'cudaDevWorkqueueConfigResource.device' in found_struct}} + device : int + The device on which the workqueue resources are available + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.wqConcurrencyLimit' in found_struct}} + wqConcurrencyLimit : unsigned int + The expected maximum number of concurrent stream-ordered workloads + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.sharingScope' in found_struct}} + sharingScope : cudaDevWorkqueueConfigScope + The sharing scope for the workqueue resources + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cyruntime.cudaDevWorkqueueConfigResource _pvt_val + cdef cyruntime.cudaDevWorkqueueConfigResource* _pvt_ptr +{{endif}} +{{if 'cudaDevWorkqueueResource' in found_struct}} + +cdef class cudaDevWorkqueueResource: + """ + Handle to a pre-existing workqueue related resource + + Attributes + ---------- + {{if 'cudaDevWorkqueueResource.reserved' in found_struct}} + reserved : bytes + Reserved for future use + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + 
cdef cyruntime.cudaDevWorkqueueResource _pvt_val + cdef cyruntime.cudaDevWorkqueueResource* _pvt_ptr +{{endif}} +{{if 'cudaDevSmResourceGroupParams_st' in found_struct}} + +cdef class cudaDevSmResourceGroupParams_st: + """ + Input data for splitting SMs + + Attributes + ---------- + {{if 'cudaDevSmResourceGroupParams_st.smCount' in found_struct}} + smCount : unsigned int + The amount of SMs available in this resource. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.coscheduledSmCount' in found_struct}} + coscheduledSmCount : unsigned int + The amount of co-scheduled SMs grouped together for locality + purposes. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.preferredCoscheduledSmCount' in found_struct}} + preferredCoscheduledSmCount : unsigned int + When possible, combine co-scheduled groups together into larger + groups of this size. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}} + flags : unsigned int + Combination of `cudaDevSmResourceGroup_flags` values to indicate + this this group is created. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}} + reserved : list[unsigned int] + Reserved for future use - ensure this is is zero initialized. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cyruntime.cudaDevSmResourceGroupParams_st _pvt_val + cdef cyruntime.cudaDevSmResourceGroupParams_st* _pvt_ptr +{{endif}} +{{if 'cudaDevResource_st' in found_struct}} + +cdef class cudaDevResource_st: + """ + A tagged union describing different resources identified by the + type field. This structure should not be directly modified outside + of the API that created it. struct enumcudaDevResourceTypetype; + union structcudaDevSmResourcesm; + structcudaDevWorkqueueConfigResourcewqConfig; + structcudaDevWorkqueueResourcewq; ; ; - If `typename` is + `cudaDevResourceTypeInvalid`, this resoure is not valid and cannot + be further accessed. 
- If `typename` is `cudaDevResourceTypeSm`, + the cudaDevSmResource structure `sm` is filled in. For example, + `sm.smCount` will reflect the amount of streaming multiprocessors + available in this resource. - If `typename` is + `cudaDevResourceTypeWorkqueueConfig`, the + cudaDevWorkqueueConfigResource structure `wqConfig` is filled in. + - If `typename` is `cudaDevResourceTypeWorkqueue`, the + cudaDevWorkqueueResource structure `wq` is filled in. + + Attributes + ---------- + {{if 'cudaDevResource_st.type' in found_struct}} + type : cudaDevResourceType + Type of resource, dictates which union field was last set + {{endif}} + {{if 'cudaDevResource_st._internal_padding' in found_struct}} + _internal_padding : bytes + + {{endif}} + {{if 'cudaDevResource_st.sm' in found_struct}} + sm : cudaDevSmResource + Resource corresponding to cudaDevResourceTypeSm `typename`. + {{endif}} + {{if 'cudaDevResource_st.wqConfig' in found_struct}} + wqConfig : cudaDevWorkqueueConfigResource + Resource corresponding to cudaDevResourceTypeWorkqueueConfig + `typename`. + {{endif}} + {{if 'cudaDevResource_st.wq' in found_struct}} + wq : cudaDevWorkqueueResource + Resource corresponding to cudaDevResourceTypeWorkqueue `typename`. 
+ {{endif}} + {{if 'cudaDevResource_st._oversize' in found_struct}} + _oversize : bytes + + {{endif}} + {{if 'cudaDevResource_st.nextResource' in found_struct}} + nextResource : cudaDevResource_st + + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cyruntime.cudaDevResource_st* _val_ptr + cdef cyruntime.cudaDevResource_st* _pvt_ptr + {{if 'cudaDevResource_st.sm' in found_struct}} + cdef cudaDevSmResource _sm + {{endif}} + {{if 'cudaDevResource_st.wqConfig' in found_struct}} + cdef cudaDevWorkqueueConfigResource _wqConfig + {{endif}} + {{if 'cudaDevResource_st.wq' in found_struct}} + cdef cudaDevWorkqueueResource _wq + {{endif}} + {{if 'cudaDevResource_st.nextResource' in found_struct}} + cdef size_t _nextResource_length + cdef cyruntime.cudaDevResource_st* _nextResource + {{endif}} +{{endif}} {{if 'cudalibraryHostUniversalFunctionAndDataTable' in found_struct}} cdef class cudalibraryHostUniversalFunctionAndDataTable: @@ -2984,6 +3236,11 @@ cdef class cudaKernelNodeParamsV2: extra : Any Pointer to kernel arguments in the "extra" format {{endif}} + {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}} + ctx : cudaExecutionContext_t + Context in which to run the kernel. If NULL will try to use the + current context. + {{endif}} Methods ------- @@ -3001,6 +3258,9 @@ cdef class cudaKernelNodeParamsV2: {{if 'cudaKernelNodeParamsV2.kernelParams' in found_struct}} cdef _HelperKernelParams _cykernelParams {{endif}} + {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}} + cdef cudaExecutionContext_t _ctx + {{endif}} {{endif}} {{if 'cudaExternalSemaphoreSignalNodeParams' in found_struct}} @@ -3196,6 +3456,10 @@ cdef class cudaConditionalNodeParams: executed when the condition is equal to n. If the condition >= `size`, no body graph is executed. 
{{endif}} + {{if 'cudaConditionalNodeParams.ctx' in found_struct}} + ctx : cudaExecutionContext_t + CUDA Execution Context + {{endif}} Methods ------- @@ -3211,6 +3475,9 @@ cdef class cudaConditionalNodeParams: cdef size_t _phGraph_out_length cdef cyruntime.cudaGraph_t* _phGraph_out {{endif}} + {{if 'cudaConditionalNodeParams.ctx' in found_struct}} + cdef cudaExecutionContext_t _ctx + {{endif}} {{endif}} {{if 'cudaChildGraphNodeParams' in found_struct}} @@ -3566,7 +3833,7 @@ cdef class anon_struct16: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} -cdef class anon_union7: +cdef class anon_union8: """ Attributes ---------- @@ -3615,7 +3882,7 @@ cdef class cudaGraphKernelNodeUpdate: interpreted {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - updateData : anon_union7 + updateData : anon_union8 Update data to apply. Which field is used depends on field's value {{endif}} @@ -3630,7 +3897,7 @@ cdef class cudaGraphKernelNodeUpdate: cdef cudaGraphDeviceNode_t _node {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - cdef anon_union7 _updateData + cdef anon_union8 _updateData {{endif}} {{endif}} {{if 'cudaLaunchMemSyncDomainMap_st' in found_struct}} @@ -3987,7 +4254,7 @@ cdef class anon_struct22: {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} -cdef class anon_union8: +cdef class anon_union9: """ Attributes ---------- @@ -4019,7 +4286,7 @@ cdef class cudaAsyncNotificationInfo: The type of notification being sent {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - info : anon_union8 + info : anon_union9 Information about the notification. `typename` must be checked in order to interpret this field. 
{{endif}} @@ -4032,7 +4299,7 @@ cdef class cudaAsyncNotificationInfo: cdef cyruntime.cudaAsyncNotificationInfo* _val_ptr cdef cyruntime.cudaAsyncNotificationInfo* _pvt_ptr {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - cdef anon_union8 _info + cdef anon_union9 _info {{endif}} {{endif}} {{if 'cudaTextureDesc' in found_struct}} @@ -4155,7 +4422,7 @@ cdef class cudaEglPlaneDesc_st: {{endif}} {{if True}} -cdef class anon_union9: +cdef class anon_union10: """ Attributes ---------- @@ -4191,7 +4458,7 @@ cdef class cudaEglFrame_st: Attributes ---------- {{if True}} - frame : anon_union9 + frame : anon_union10 {{endif}} {{if True}} @@ -4219,7 +4486,7 @@ cdef class cudaEglFrame_st: cdef cyruntime.cudaEglFrame_st* _val_ptr cdef cyruntime.cudaEglFrame_st* _pvt_ptr {{if True}} - cdef anon_union9 _frame + cdef anon_union10 _frame {{endif}} {{endif}} {{if 'CUuuid' in found_types}} @@ -4316,6 +4583,104 @@ cdef class cudaMemFabricHandle_t(cudaMemFabricHandle_st): """ pass {{endif}} +{{if 'cudaDevSmResourceGroupParams' in found_types}} + +cdef class cudaDevSmResourceGroupParams(cudaDevSmResourceGroupParams_st): + """ + Input data for splitting SMs + + Attributes + ---------- + {{if 'cudaDevSmResourceGroupParams_st.smCount' in found_struct}} + smCount : unsigned int + The amount of SMs available in this resource. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.coscheduledSmCount' in found_struct}} + coscheduledSmCount : unsigned int + The amount of co-scheduled SMs grouped together for locality + purposes. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.preferredCoscheduledSmCount' in found_struct}} + preferredCoscheduledSmCount : unsigned int + When possible, combine co-scheduled groups together into larger + groups of this size. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}} + flags : unsigned int + Combination of `cudaDevSmResourceGroup_flags` values to indicate + this this group is created. 
+ {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}} + reserved : list[unsigned int] + Reserved for future use - ensure this is is zero initialized. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + pass +{{endif}} +{{if 'cudaDevResource' in found_types}} + +cdef class cudaDevResource(cudaDevResource_st): + """ + A tagged union describing different resources identified by the + type field. This structure should not be directly modified outside + of the API that created it. struct enumcudaDevResourceTypetype; + union structcudaDevSmResourcesm; + structcudaDevWorkqueueConfigResourcewqConfig; + structcudaDevWorkqueueResourcewq; ; ; - If `typename` is + `cudaDevResourceTypeInvalid`, this resoure is not valid and cannot + be further accessed. - If `typename` is `cudaDevResourceTypeSm`, + the cudaDevSmResource structure `sm` is filled in. For example, + `sm.smCount` will reflect the amount of streaming multiprocessors + available in this resource. - If `typename` is + `cudaDevResourceTypeWorkqueueConfig`, the + cudaDevWorkqueueConfigResource structure `wqConfig` is filled in. + - If `typename` is `cudaDevResourceTypeWorkqueue`, the + cudaDevWorkqueueResource structure `wq` is filled in. + + Attributes + ---------- + {{if 'cudaDevResource_st.type' in found_struct}} + type : cudaDevResourceType + Type of resource, dictates which union field was last set + {{endif}} + {{if 'cudaDevResource_st._internal_padding' in found_struct}} + _internal_padding : bytes + + {{endif}} + {{if 'cudaDevResource_st.sm' in found_struct}} + sm : cudaDevSmResource + Resource corresponding to cudaDevResourceTypeSm `typename`. + {{endif}} + {{if 'cudaDevResource_st.wqConfig' in found_struct}} + wqConfig : cudaDevWorkqueueConfigResource + Resource corresponding to cudaDevResourceTypeWorkqueueConfig + `typename`. 
+ {{endif}} + {{if 'cudaDevResource_st.wq' in found_struct}} + wq : cudaDevWorkqueueResource + Resource corresponding to cudaDevResourceTypeWorkqueue `typename`. + {{endif}} + {{if 'cudaDevResource_st._oversize' in found_struct}} + _oversize : bytes + + {{endif}} + {{if 'cudaDevResource_st.nextResource' in found_struct}} + nextResource : cudaDevResource_st + + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + pass +{{endif}} {{if 'cudaGraphEdgeData' in found_types}} cdef class cudaGraphEdgeData(cudaGraphEdgeData_st): @@ -4497,7 +4862,7 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo): The type of notification being sent {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - info : anon_union8 + info : anon_union9 Information about the notification. `typename` must be checked in order to interpret this field. {{endif}} @@ -4814,7 +5179,7 @@ cdef class cudaEglFrame(cudaEglFrame_st): Attributes ---------- {{if True}} - frame : anon_union9 + frame : anon_union10 {{endif}} {{if True}} diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 7743b9d610..602b80fd5f 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 13.0.0. Do not modify it directly. +# This code was automatically generated with version 13.1.0. Do not modify it directly. from typing import Any, Optional from enum import IntEnum import cython @@ -1146,6 +1146,13 @@ class cudaError_t(IntEnum): #: This error indicates one or more resources are insufficient or non- #: applicable for the operation. 
cudaErrorInvalidResourceConfiguration = cyruntime.cudaError.cudaErrorInvalidResourceConfiguration{{endif}} + {{if 'cudaErrorStreamDetached' in found_values}} + + #: This error indicates that the requested operation is not permitted + #: because the stream is in a detached state. This can occur if the + #: green context associated with the stream has been destroyed, + #: limiting the stream's operational capabilities. + cudaErrorStreamDetached = cyruntime.cudaError.cudaErrorStreamDetached{{endif}} {{if 'cudaErrorUnknown' in found_values}} #: This indicates that an unknown internal error has occurred. @@ -4170,6 +4177,74 @@ class cudaExternalSemaphoreHandleType(IntEnum): _dict_cudaExternalSemaphoreHandleType = dict(((int(v), v) for k, v in cudaExternalSemaphoreHandleType.__members__.items())) {{endif}} +{{if 'cudaDevSmResourceGroup_flags' in found_types}} + +class cudaDevSmResourceGroup_flags(IntEnum): + """ + + """ + {{if 'cudaDevSmResourceGroupDefault' in found_values}} + cudaDevSmResourceGroupDefault = cyruntime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupDefault{{endif}} + {{if 'cudaDevSmResourceGroupBackfill' in found_values}} + cudaDevSmResourceGroupBackfill = cyruntime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupBackfill{{endif}} + +_dict_cudaDevSmResourceGroup_flags = dict(((int(v), v) for k, v in cudaDevSmResourceGroup_flags.__members__.items())) +{{endif}} +{{if 'cudaDevSmResourceSplitByCount_flags' in found_types}} + +class cudaDevSmResourceSplitByCount_flags(IntEnum): + """ + + """ + {{if 'cudaDevSmResourceSplitIgnoreSmCoscheduling' in found_values}} + cudaDevSmResourceSplitIgnoreSmCoscheduling = cyruntime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitIgnoreSmCoscheduling{{endif}} + {{if 'cudaDevSmResourceSplitMaxPotentialClusterSize' in found_values}} + cudaDevSmResourceSplitMaxPotentialClusterSize = cyruntime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitMaxPotentialClusterSize{{endif}} + 
+_dict_cudaDevSmResourceSplitByCount_flags = dict(((int(v), v) for k, v in cudaDevSmResourceSplitByCount_flags.__members__.items())) +{{endif}} +{{if 'cudaDevResourceType' in found_types}} + +class cudaDevResourceType(IntEnum): + """ + Type of resource + """ + {{if 'cudaDevResourceTypeInvalid' in found_values}} + cudaDevResourceTypeInvalid = cyruntime.cudaDevResourceType.cudaDevResourceTypeInvalid{{endif}} + {{if 'cudaDevResourceTypeSm' in found_values}} + + #: Streaming multiprocessors related information + cudaDevResourceTypeSm = cyruntime.cudaDevResourceType.cudaDevResourceTypeSm{{endif}} + {{if 'cudaDevResourceTypeWorkqueueConfig' in found_values}} + + #: Workqueue configuration related information + cudaDevResourceTypeWorkqueueConfig = cyruntime.cudaDevResourceType.cudaDevResourceTypeWorkqueueConfig{{endif}} + {{if 'cudaDevResourceTypeWorkqueue' in found_values}} + + #: Pre-existing workqueue related information + cudaDevResourceTypeWorkqueue = cyruntime.cudaDevResourceType.cudaDevResourceTypeWorkqueue{{endif}} + +_dict_cudaDevResourceType = dict(((int(v), v) for k, v in cudaDevResourceType.__members__.items())) +{{endif}} +{{if 'cudaDevWorkqueueConfigScope' in found_types}} + +class cudaDevWorkqueueConfigScope(IntEnum): + """ + Sharing scope for workqueues + """ + {{if 'cudaDevWorkqueueConfigScopeDeviceCtx' in found_values}} + + #: Use all shared workqueue resources on the device. Default driver + #: behaviour. + cudaDevWorkqueueConfigScopeDeviceCtx = cyruntime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeDeviceCtx{{endif}} + {{if 'cudaDevWorkqueueConfigScopeGreenCtxBalanced' in found_values}} + + #: When possible, use non-overlapping workqueue resources with other + #: balanced green contexts. 
+ cudaDevWorkqueueConfigScopeGreenCtxBalanced = cyruntime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeGreenCtxBalanced{{endif}} + +_dict_cudaDevWorkqueueConfigScope = dict(((int(v), v) for k, v in cudaDevWorkqueueConfigScope.__members__.items())) +{{endif}} {{if 'cudaJitOption' in found_types}} class cudaJitOption(IntEnum): @@ -5367,6 +5442,80 @@ class cudaKernelNodeAttrID(IntEnum): _dict_cudaLaunchAttributeID = dict(((int(v), v) for k, v in cudaLaunchAttributeID.__members__.items())) {{endif}} +{{if 'cudaDevResourceDesc_t' in found_types}} + +cdef class cudaDevResourceDesc_t: + """ + + An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via ::cudaDeviceResourceGenerateDesc + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + def __cinit__(self, void_ptr init_value = 0, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + self._pvt_ptr[0] = init_value + else: + self._pvt_ptr = _ptr + def __init__(self, *args, **kwargs): + pass + def __repr__(self): + return '' + def __index__(self): + return self.__int__() + def __eq__(self, other): + if not isinstance(other, cudaDevResourceDesc_t): + return False + return self._pvt_ptr[0] == (other)._pvt_ptr[0] + def __hash__(self): + return hash((self._pvt_ptr[0])) + def __int__(self): + return self._pvt_ptr[0] + def getPtr(self): + return self._pvt_ptr +{{endif}} + +{{if 'cudaExecutionContext_t' in found_types}} + +cdef class cudaExecutionContext_t: + """ + + An opaque handle to a CUDA execution context. It represents an execution context created via CUDA Runtime APIs such as cudaGreenCtxCreate. 
+ + Methods + ------- + getPtr() + Get memory address of class instance + + """ + def __cinit__(self, void_ptr init_value = 0, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + self._pvt_ptr[0] = init_value + else: + self._pvt_ptr = _ptr + def __init__(self, *args, **kwargs): + pass + def __repr__(self): + return '' + def __index__(self): + return self.__int__() + def __eq__(self, other): + if not isinstance(other, cudaExecutionContext_t): + return False + return self._pvt_ptr[0] == (other)._pvt_ptr[0] + def __hash__(self): + return hash((self._pvt_ptr[0])) + def __int__(self): + return self._pvt_ptr[0] + def getPtr(self): + return self._pvt_ptr +{{endif}} + {{if 'cudaArray_t' in found_types}} cdef class cudaArray_t: @@ -7039,9 +7188,14 @@ cdef class cudaMemcpyNodeParams: Must be zero {{endif}} {{if 'cudaMemcpyNodeParams.reserved' in found_struct}} - reserved : list[int] + reserved : int Must be zero {{endif}} + {{if 'cudaMemcpyNodeParams.ctx' in found_struct}} + ctx : cudaExecutionContext_t + Context in which to run the memcpy. If NULL will try to use the + current context. 
+ {{endif}} {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}} copyParams : cudaMemcpy3DParms Parameters for the memory copy @@ -7059,6 +7213,9 @@ cdef class cudaMemcpyNodeParams: self._pvt_ptr = _ptr def __init__(self, void_ptr _ptr = 0): pass + {{if 'cudaMemcpyNodeParams.ctx' in found_struct}} + self._ctx = cudaExecutionContext_t(_ptr=&self._pvt_ptr[0].ctx) + {{endif}} {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}} self._copyParams = cudaMemcpy3DParms(_ptr=&self._pvt_ptr[0].copyParams) {{endif}} @@ -7081,6 +7238,12 @@ cdef class cudaMemcpyNodeParams: except ValueError: str_list += ['reserved : '] {{endif}} + {{if 'cudaMemcpyNodeParams.ctx' in found_struct}} + try: + str_list += ['ctx : ' + str(self.ctx)] + except ValueError: + str_list += ['ctx : '] + {{endif}} {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}} try: str_list += ['copyParams :\n' + '\n'.join([' ' + line for line in str(self.copyParams).splitlines()])] @@ -7103,9 +7266,26 @@ cdef class cudaMemcpyNodeParams: def reserved(self): return self._pvt_ptr[0].reserved @reserved.setter - def reserved(self, reserved): + def reserved(self, int reserved): self._pvt_ptr[0].reserved = reserved {{endif}} + {{if 'cudaMemcpyNodeParams.ctx' in found_struct}} + @property + def ctx(self): + return self._ctx + @ctx.setter + def ctx(self, ctx): + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + cyctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + cyctx = pctx + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + self._ctx._pvt_ptr[0] = cyctx + {{endif}} {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}} @property def copyParams(self): @@ -7521,6 +7701,11 @@ cdef class cudaMemsetParamsV2: height : size_t Number of rows {{endif}} + {{if 'cudaMemsetParamsV2.ctx' in found_struct}} + ctx : cudaExecutionContext_t + Context in which to run the memset. If NULL will try to use the + current context. 
+ {{endif}} Methods ------- @@ -7534,6 +7719,9 @@ cdef class cudaMemsetParamsV2: self._pvt_ptr = _ptr def __init__(self, void_ptr _ptr = 0): pass + {{if 'cudaMemsetParamsV2.ctx' in found_struct}} + self._ctx = cudaExecutionContext_t(_ptr=&self._pvt_ptr[0].ctx) + {{endif}} def __dealloc__(self): pass def getPtr(self): @@ -7577,6 +7765,12 @@ cdef class cudaMemsetParamsV2: except ValueError: str_list += ['height : '] {{endif}} + {{if 'cudaMemsetParamsV2.ctx' in found_struct}} + try: + str_list += ['ctx : ' + str(self.ctx)] + except ValueError: + str_list += ['ctx : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -7629,6 +7823,23 @@ cdef class cudaMemsetParamsV2: def height(self, size_t height): self._pvt_ptr[0].height = height {{endif}} + {{if 'cudaMemsetParamsV2.ctx' in found_struct}} + @property + def ctx(self): + return self._ctx + @ctx.setter + def ctx(self, ctx): + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + cyctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + cyctx = pctx + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + self._ctx._pvt_ptr[0] = cyctx + {{endif}} {{endif}} {{if 'cudaAccessPolicyWindow' in found_struct}} @@ -14128,6 +14339,593 @@ cdef class cudaExternalSemaphoreWaitParams: self._pvt_ptr[0].reserved = reserved {{endif}} {{endif}} +{{if 'cudaDevSmResource' in found_struct}} + +cdef class cudaDevSmResource: + """ + Data for SM-related resources All parameters in this structure are + OUTPUT only. Do not write to any of the fields in this structure. + + Attributes + ---------- + {{if 'cudaDevSmResource.smCount' in found_struct}} + smCount : unsigned int + The amount of streaming multiprocessors available in this resource. + {{endif}} + {{if 'cudaDevSmResource.minSmPartitionSize' in found_struct}} + minSmPartitionSize : unsigned int + The minimum number of streaming multiprocessors required to + partition this resource. 
+ {{endif}} + {{if 'cudaDevSmResource.smCoscheduledAlignment' in found_struct}} + smCoscheduledAlignment : unsigned int + The number of streaming multiprocessors in this resource that are + guaranteed to be co-scheduled on the same GPU processing cluster. + smCount will be a multiple of this value, unless the backfill flag + is set. + {{endif}} + {{if 'cudaDevSmResource.flags' in found_struct}} + flags : unsigned int + The flags set on this SM resource. For available flags see + ::cudaDevSmResourceGroup_flags. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + def __dealloc__(self): + pass + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'cudaDevSmResource.smCount' in found_struct}} + try: + str_list += ['smCount : ' + str(self.smCount)] + except ValueError: + str_list += ['smCount : '] + {{endif}} + {{if 'cudaDevSmResource.minSmPartitionSize' in found_struct}} + try: + str_list += ['minSmPartitionSize : ' + str(self.minSmPartitionSize)] + except ValueError: + str_list += ['minSmPartitionSize : '] + {{endif}} + {{if 'cudaDevSmResource.smCoscheduledAlignment' in found_struct}} + try: + str_list += ['smCoscheduledAlignment : ' + str(self.smCoscheduledAlignment)] + except ValueError: + str_list += ['smCoscheduledAlignment : '] + {{endif}} + {{if 'cudaDevSmResource.flags' in found_struct}} + try: + str_list += ['flags : ' + str(self.flags)] + except ValueError: + str_list += ['flags : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'cudaDevSmResource.smCount' in found_struct}} + @property + def smCount(self): + return self._pvt_ptr[0].smCount + @smCount.setter + def smCount(self, unsigned int smCount): + self._pvt_ptr[0].smCount = smCount + {{endif}} + {{if 
'cudaDevSmResource.minSmPartitionSize' in found_struct}} + @property + def minSmPartitionSize(self): + return self._pvt_ptr[0].minSmPartitionSize + @minSmPartitionSize.setter + def minSmPartitionSize(self, unsigned int minSmPartitionSize): + self._pvt_ptr[0].minSmPartitionSize = minSmPartitionSize + {{endif}} + {{if 'cudaDevSmResource.smCoscheduledAlignment' in found_struct}} + @property + def smCoscheduledAlignment(self): + return self._pvt_ptr[0].smCoscheduledAlignment + @smCoscheduledAlignment.setter + def smCoscheduledAlignment(self, unsigned int smCoscheduledAlignment): + self._pvt_ptr[0].smCoscheduledAlignment = smCoscheduledAlignment + {{endif}} + {{if 'cudaDevSmResource.flags' in found_struct}} + @property + def flags(self): + return self._pvt_ptr[0].flags + @flags.setter + def flags(self, unsigned int flags): + self._pvt_ptr[0].flags = flags + {{endif}} +{{endif}} +{{if 'cudaDevWorkqueueConfigResource' in found_struct}} + +cdef class cudaDevWorkqueueConfigResource: + """ + Data for workqueue configuration related resources + + Attributes + ---------- + {{if 'cudaDevWorkqueueConfigResource.device' in found_struct}} + device : int + The device on which the workqueue resources are available + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.wqConcurrencyLimit' in found_struct}} + wqConcurrencyLimit : unsigned int + The expected maximum number of concurrent stream-ordered workloads + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.sharingScope' in found_struct}} + sharingScope : cudaDevWorkqueueConfigScope + The sharing scope for the workqueue resources + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + def __dealloc__(self): + pass + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 
'cudaDevWorkqueueConfigResource.device' in found_struct}} + try: + str_list += ['device : ' + str(self.device)] + except ValueError: + str_list += ['device : '] + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.wqConcurrencyLimit' in found_struct}} + try: + str_list += ['wqConcurrencyLimit : ' + str(self.wqConcurrencyLimit)] + except ValueError: + str_list += ['wqConcurrencyLimit : '] + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.sharingScope' in found_struct}} + try: + str_list += ['sharingScope : ' + str(self.sharingScope)] + except ValueError: + str_list += ['sharingScope : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'cudaDevWorkqueueConfigResource.device' in found_struct}} + @property + def device(self): + return self._pvt_ptr[0].device + @device.setter + def device(self, int device): + self._pvt_ptr[0].device = device + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.wqConcurrencyLimit' in found_struct}} + @property + def wqConcurrencyLimit(self): + return self._pvt_ptr[0].wqConcurrencyLimit + @wqConcurrencyLimit.setter + def wqConcurrencyLimit(self, unsigned int wqConcurrencyLimit): + self._pvt_ptr[0].wqConcurrencyLimit = wqConcurrencyLimit + {{endif}} + {{if 'cudaDevWorkqueueConfigResource.sharingScope' in found_struct}} + @property + def sharingScope(self): + if self._pvt_ptr[0].sharingScope not in _dict_cudaDevWorkqueueConfigScope: + return None + return _dict_cudaDevWorkqueueConfigScope[self._pvt_ptr[0].sharingScope] + @sharingScope.setter + def sharingScope(self, sharingScope not None : cudaDevWorkqueueConfigScope): + self._pvt_ptr[0].sharingScope = sharingScope.value + {{endif}} +{{endif}} +{{if 'cudaDevWorkqueueResource' in found_struct}} + +cdef class cudaDevWorkqueueResource: + """ + Handle to a pre-existing workqueue related resource + + Attributes + ---------- + {{if 'cudaDevWorkqueueResource.reserved' in found_struct}} + reserved : bytes + Reserved for future use + {{endif}} + + Methods + ------- + getPtr() + Get 
memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + def __dealloc__(self): + pass + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'cudaDevWorkqueueResource.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'cudaDevWorkqueueResource.reserved' in found_struct}} + @property + def reserved(self): + return PyBytes_FromStringAndSize(self._pvt_ptr[0].reserved, 40) + @reserved.setter + def reserved(self, reserved): + if len(reserved) != 40: + raise ValueError("reserved length must be 40, is " + str(len(reserved))) + for i, b in enumerate(reserved): + self._pvt_ptr[0].reserved[i] = b + {{endif}} +{{endif}} +{{if 'cudaDevSmResourceGroupParams_st' in found_struct}} + +cdef class cudaDevSmResourceGroupParams_st: + """ + Input data for splitting SMs + + Attributes + ---------- + {{if 'cudaDevSmResourceGroupParams_st.smCount' in found_struct}} + smCount : unsigned int + The amount of SMs available in this resource. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.coscheduledSmCount' in found_struct}} + coscheduledSmCount : unsigned int + The amount of co-scheduled SMs grouped together for locality + purposes. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.preferredCoscheduledSmCount' in found_struct}} + preferredCoscheduledSmCount : unsigned int + When possible, combine co-scheduled groups together into larger + groups of this size. + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}} + flags : unsigned int + Combination of `cudaDevSmResourceGroup_flags` values to indicate + this this group is created. 
+ {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}} + reserved : list[unsigned int] + Reserved for future use - ensure this is is zero initialized. + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + def __dealloc__(self): + pass + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'cudaDevSmResourceGroupParams_st.smCount' in found_struct}} + try: + str_list += ['smCount : ' + str(self.smCount)] + except ValueError: + str_list += ['smCount : '] + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.coscheduledSmCount' in found_struct}} + try: + str_list += ['coscheduledSmCount : ' + str(self.coscheduledSmCount)] + except ValueError: + str_list += ['coscheduledSmCount : '] + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.preferredCoscheduledSmCount' in found_struct}} + try: + str_list += ['preferredCoscheduledSmCount : ' + str(self.preferredCoscheduledSmCount)] + except ValueError: + str_list += ['preferredCoscheduledSmCount : '] + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}} + try: + str_list += ['flags : ' + str(self.flags)] + except ValueError: + str_list += ['flags : '] + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'cudaDevSmResourceGroupParams_st.smCount' in found_struct}} + @property + def smCount(self): + return self._pvt_ptr[0].smCount + @smCount.setter + def smCount(self, unsigned int smCount): + self._pvt_ptr[0].smCount = smCount + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.coscheduledSmCount' in found_struct}} + 
@property + def coscheduledSmCount(self): + return self._pvt_ptr[0].coscheduledSmCount + @coscheduledSmCount.setter + def coscheduledSmCount(self, unsigned int coscheduledSmCount): + self._pvt_ptr[0].coscheduledSmCount = coscheduledSmCount + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.preferredCoscheduledSmCount' in found_struct}} + @property + def preferredCoscheduledSmCount(self): + return self._pvt_ptr[0].preferredCoscheduledSmCount + @preferredCoscheduledSmCount.setter + def preferredCoscheduledSmCount(self, unsigned int preferredCoscheduledSmCount): + self._pvt_ptr[0].preferredCoscheduledSmCount = preferredCoscheduledSmCount + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}} + @property + def flags(self): + return self._pvt_ptr[0].flags + @flags.setter + def flags(self, unsigned int flags): + self._pvt_ptr[0].flags = flags + {{endif}} + {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} +{{endif}} +{{if 'cudaDevResource_st' in found_struct}} + +cdef class cudaDevResource_st: + """ + A tagged union describing different resources identified by the + type field. This structure should not be directly modified outside + of the API that created it. struct enumcudaDevResourceTypetype; + union structcudaDevSmResourcesm; + structcudaDevWorkqueueConfigResourcewqConfig; + structcudaDevWorkqueueResourcewq; ; ; - If `typename` is + `cudaDevResourceTypeInvalid`, this resoure is not valid and cannot + be further accessed. - If `typename` is `cudaDevResourceTypeSm`, + the cudaDevSmResource structure `sm` is filled in. For example, + `sm.smCount` will reflect the amount of streaming multiprocessors + available in this resource. - If `typename` is + `cudaDevResourceTypeWorkqueueConfig`, the + cudaDevWorkqueueConfigResource structure `wqConfig` is filled in. 
+ - If `typename` is `cudaDevResourceTypeWorkqueue`, the + cudaDevWorkqueueResource structure `wq` is filled in. + + Attributes + ---------- + {{if 'cudaDevResource_st.type' in found_struct}} + type : cudaDevResourceType + Type of resource, dictates which union field was last set + {{endif}} + {{if 'cudaDevResource_st._internal_padding' in found_struct}} + _internal_padding : bytes + + {{endif}} + {{if 'cudaDevResource_st.sm' in found_struct}} + sm : cudaDevSmResource + Resource corresponding to cudaDevResourceTypeSm `typename`. + {{endif}} + {{if 'cudaDevResource_st.wqConfig' in found_struct}} + wqConfig : cudaDevWorkqueueConfigResource + Resource corresponding to cudaDevResourceTypeWorkqueueConfig + `typename`. + {{endif}} + {{if 'cudaDevResource_st.wq' in found_struct}} + wq : cudaDevWorkqueueResource + Resource corresponding to cudaDevResourceTypeWorkqueue `typename`. + {{endif}} + {{if 'cudaDevResource_st._oversize' in found_struct}} + _oversize : bytes + + {{endif}} + {{if 'cudaDevResource_st.nextResource' in found_struct}} + nextResource : cudaDevResource_st + + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr = 0): + if _ptr == 0: + self._val_ptr = calloc(1, sizeof(cyruntime.cudaDevResource_st)) + self._pvt_ptr = self._val_ptr + else: + self._pvt_ptr = _ptr + def __init__(self, void_ptr _ptr = 0): + pass + {{if 'cudaDevResource_st.sm' in found_struct}} + self._sm = cudaDevSmResource(_ptr=&self._pvt_ptr[0].sm) + {{endif}} + {{if 'cudaDevResource_st.wqConfig' in found_struct}} + self._wqConfig = cudaDevWorkqueueConfigResource(_ptr=&self._pvt_ptr[0].wqConfig) + {{endif}} + {{if 'cudaDevResource_st.wq' in found_struct}} + self._wq = cudaDevWorkqueueResource(_ptr=&self._pvt_ptr[0].wq) + {{endif}} + def __dealloc__(self): + if self._val_ptr is not NULL: + free(self._val_ptr) + {{if 'cudaDevResource_st.nextResource' in found_struct}} + if self._nextResource is not NULL: + 
free(self._nextResource) + {{endif}} + def getPtr(self): + return self._pvt_ptr + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'cudaDevResource_st.type' in found_struct}} + try: + str_list += ['type : ' + str(self.type)] + except ValueError: + str_list += ['type : '] + {{endif}} + {{if 'cudaDevResource_st._internal_padding' in found_struct}} + try: + str_list += ['_internal_padding : ' + str(self._internal_padding)] + except ValueError: + str_list += ['_internal_padding : '] + {{endif}} + {{if 'cudaDevResource_st.sm' in found_struct}} + try: + str_list += ['sm :\n' + '\n'.join([' ' + line for line in str(self.sm).splitlines()])] + except ValueError: + str_list += ['sm : '] + {{endif}} + {{if 'cudaDevResource_st.wqConfig' in found_struct}} + try: + str_list += ['wqConfig :\n' + '\n'.join([' ' + line for line in str(self.wqConfig).splitlines()])] + except ValueError: + str_list += ['wqConfig : '] + {{endif}} + {{if 'cudaDevResource_st.wq' in found_struct}} + try: + str_list += ['wq :\n' + '\n'.join([' ' + line for line in str(self.wq).splitlines()])] + except ValueError: + str_list += ['wq : '] + {{endif}} + {{if 'cudaDevResource_st._oversize' in found_struct}} + try: + str_list += ['_oversize : ' + str(self._oversize)] + except ValueError: + str_list += ['_oversize : '] + {{endif}} + {{if 'cudaDevResource_st.nextResource' in found_struct}} + try: + str_list += ['nextResource : ' + str(self.nextResource)] + except ValueError: + str_list += ['nextResource : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'cudaDevResource_st.type' in found_struct}} + @property + def type(self): + if self._pvt_ptr[0].type not in _dict_cudaDevResourceType: + return None + return _dict_cudaDevResourceType[self._pvt_ptr[0].type] + @type.setter + def type(self, type not None : cudaDevResourceType): + self._pvt_ptr[0].type = type.value + {{endif}} + {{if 'cudaDevResource_st._internal_padding' in found_struct}} + @property + def 
_internal_padding(self): + return PyBytes_FromStringAndSize(self._pvt_ptr[0]._internal_padding, 92) + @_internal_padding.setter + def _internal_padding(self, _internal_padding): + if len(_internal_padding) != 92: + raise ValueError("_internal_padding length must be 92, is " + str(len(_internal_padding))) + for i, b in enumerate(_internal_padding): + self._pvt_ptr[0]._internal_padding[i] = b + {{endif}} + {{if 'cudaDevResource_st.sm' in found_struct}} + @property + def sm(self): + return self._sm + @sm.setter + def sm(self, sm not None : cudaDevSmResource): + string.memcpy(&self._pvt_ptr[0].sm, sm.getPtr(), sizeof(self._pvt_ptr[0].sm)) + {{endif}} + {{if 'cudaDevResource_st.wqConfig' in found_struct}} + @property + def wqConfig(self): + return self._wqConfig + @wqConfig.setter + def wqConfig(self, wqConfig not None : cudaDevWorkqueueConfigResource): + string.memcpy(&self._pvt_ptr[0].wqConfig, wqConfig.getPtr(), sizeof(self._pvt_ptr[0].wqConfig)) + {{endif}} + {{if 'cudaDevResource_st.wq' in found_struct}} + @property + def wq(self): + return self._wq + @wq.setter + def wq(self, wq not None : cudaDevWorkqueueResource): + string.memcpy(&self._pvt_ptr[0].wq, wq.getPtr(), sizeof(self._pvt_ptr[0].wq)) + {{endif}} + {{if 'cudaDevResource_st._oversize' in found_struct}} + @property + def _oversize(self): + return PyBytes_FromStringAndSize(self._pvt_ptr[0]._oversize, 40) + @_oversize.setter + def _oversize(self, _oversize): + if len(_oversize) != 40: + raise ValueError("_oversize length must be 40, is " + str(len(_oversize))) + for i, b in enumerate(_oversize): + self._pvt_ptr[0]._oversize[i] = b + {{endif}} + {{if 'cudaDevResource_st.nextResource' in found_struct}} + @property + def nextResource(self): + arrs = [self._pvt_ptr[0].nextResource + x*sizeof(cyruntime.cudaDevResource_st) for x in range(self._nextResource_length)] + return [cudaDevResource_st(_ptr=arr) for arr in arrs] + @nextResource.setter + def nextResource(self, val): + if len(val) == 0: + 
free(self._nextResource) + self._nextResource_length = 0 + self._pvt_ptr[0].nextResource = NULL + else: + if self._nextResource_length != len(val): + free(self._nextResource) + self._nextResource = calloc(len(val), sizeof(cyruntime.cudaDevResource_st)) + if self._nextResource is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(val)) + 'x' + str(sizeof(cyruntime.cudaDevResource_st))) + self._nextResource_length = len(val) + self._pvt_ptr[0].nextResource = self._nextResource + for idx in range(len(val)): + string.memcpy(&self._nextResource[idx], (val[idx])._pvt_ptr, sizeof(cyruntime.cudaDevResource_st)) + + {{endif}} +{{endif}} {{if 'cudalibraryHostUniversalFunctionAndDataTable' in found_struct}} cdef class cudalibraryHostUniversalFunctionAndDataTable: @@ -14412,6 +15210,11 @@ cdef class cudaKernelNodeParamsV2: extra : Any Pointer to kernel arguments in the "extra" format {{endif}} + {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}} + ctx : cudaExecutionContext_t + Context in which to run the kernel. If NULL will try to use the + current context. 
+ {{endif}} Methods ------- @@ -14431,6 +15234,9 @@ cdef class cudaKernelNodeParamsV2: {{if 'cudaKernelNodeParamsV2.blockDim' in found_struct}} self._blockDim = dim3(_ptr=&self._pvt_ptr[0].blockDim) {{endif}} + {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}} + self._ctx = cudaExecutionContext_t(_ptr=&self._pvt_ptr[0].ctx) + {{endif}} def __dealloc__(self): pass def getPtr(self): @@ -14474,6 +15280,12 @@ cdef class cudaKernelNodeParamsV2: except ValueError: str_list += ['extra : '] {{endif}} + {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}} + try: + str_list += ['ctx : ' + str(self.ctx)] + except ValueError: + str_list += ['ctx : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -14527,6 +15339,23 @@ cdef class cudaKernelNodeParamsV2: def extra(self, void_ptr extra): self._pvt_ptr[0].extra = extra {{endif}} + {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}} + @property + def ctx(self): + return self._ctx + @ctx.setter + def ctx(self, ctx): + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + cyctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + cyctx = pctx + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + self._ctx._pvt_ptr[0] = cyctx + {{endif}} {{endif}} {{if 'cudaExternalSemaphoreSignalNodeParams' in found_struct}} @@ -15070,6 +15899,10 @@ cdef class cudaConditionalNodeParams: executed when the condition is equal to n. If the condition >= `size`, no body graph is executed. 
{{endif}} + {{if 'cudaConditionalNodeParams.ctx' in found_struct}} + ctx : cudaExecutionContext_t + CUDA Execution Context + {{endif}} Methods ------- @@ -15086,6 +15919,9 @@ cdef class cudaConditionalNodeParams: {{if 'cudaConditionalNodeParams.handle' in found_struct}} self._handle = cudaGraphConditionalHandle(_ptr=&self._pvt_ptr[0].handle) {{endif}} + {{if 'cudaConditionalNodeParams.ctx' in found_struct}} + self._ctx = cudaExecutionContext_t(_ptr=&self._pvt_ptr[0].ctx) + {{endif}} def __dealloc__(self): pass def getPtr(self): @@ -15117,6 +15953,12 @@ cdef class cudaConditionalNodeParams: except ValueError: str_list += ['phGraph_out : '] {{endif}} + {{if 'cudaConditionalNodeParams.ctx' in found_struct}} + try: + str_list += ['ctx : ' + str(self.ctx)] + except ValueError: + str_list += ['ctx : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -15162,6 +16004,23 @@ cdef class cudaConditionalNodeParams: arrs = [self._pvt_ptr[0].phGraph_out + x*sizeof(cyruntime.cudaGraph_t) for x in range(self.size)] return [cudaGraph_t(_ptr=arr) for arr in arrs] {{endif}} + {{if 'cudaConditionalNodeParams.ctx' in found_struct}} + @property + def ctx(self): + return self._ctx + @ctx.setter + def ctx(self, ctx): + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + cyctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + cyctx = pctx + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + self._ctx._pvt_ptr[0] = cyctx + {{endif}} {{endif}} {{if 'cudaChildGraphNodeParams' in found_struct}} @@ -16189,7 +17048,7 @@ cdef class anon_struct16: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} -cdef class anon_union7: +cdef class anon_union8: """ Attributes ---------- @@ -16294,7 +17153,7 @@ cdef class cudaGraphKernelNodeUpdate: interpreted {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - updateData : anon_union7 + updateData : anon_union8 Update data to apply. 
Which field is used depends on field's value {{endif}} @@ -16315,7 +17174,7 @@ cdef class cudaGraphKernelNodeUpdate: self._node = cudaGraphDeviceNode_t(_ptr=&self._pvt_ptr[0].node) {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - self._updateData = anon_union7(_ptr=self._pvt_ptr) + self._updateData = anon_union8(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): if self._val_ptr is not NULL: @@ -16378,8 +17237,8 @@ cdef class cudaGraphKernelNodeUpdate: def updateData(self): return self._updateData @updateData.setter - def updateData(self, updateData not None : anon_union7): - string.memcpy(&self._pvt_ptr[0].updateData, updateData.getPtr(), sizeof(self._pvt_ptr[0].updateData)) + def updateData(self, updateData not None : anon_union8): + string.memcpy(&self._pvt_ptr[0].updateData, updateData.getPtr(), sizeof(self._pvt_ptr[0].updateData)) {{endif}} {{endif}} {{if 'cudaLaunchMemSyncDomainMap_st' in found_struct}} @@ -17387,7 +18246,7 @@ cdef class anon_struct22: {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} -cdef class anon_union8: +cdef class anon_union9: """ Attributes ---------- @@ -17447,7 +18306,7 @@ cdef class cudaAsyncNotificationInfo: The type of notification being sent {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - info : anon_union8 + info : anon_union9 Information about the notification. `typename` must be checked in order to interpret this field. 
{{endif}} @@ -17466,7 +18325,7 @@ cdef class cudaAsyncNotificationInfo: def __init__(self, void_ptr _ptr = 0): pass {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - self._info = anon_union8(_ptr=self._pvt_ptr) + self._info = anon_union9(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): if self._val_ptr is not NULL: @@ -17506,8 +18365,8 @@ cdef class cudaAsyncNotificationInfo: def info(self): return self._info @info.setter - def info(self, info not None : anon_union8): - string.memcpy(&self._pvt_ptr[0].info, info.getPtr(), sizeof(self._pvt_ptr[0].info)) + def info(self, info not None : anon_union9): + string.memcpy(&self._pvt_ptr[0].info, info.getPtr(), sizeof(self._pvt_ptr[0].info)) {{endif}} {{endif}} {{if 'cudaTextureDesc' in found_struct}} @@ -17946,7 +18805,7 @@ cdef class cudaEglPlaneDesc_st: {{endif}} {{if True}} -cdef class anon_union9: +cdef class anon_union10: """ Attributes ---------- @@ -18036,7 +18895,7 @@ cdef class cudaEglFrame_st: Attributes ---------- {{if True}} - frame : anon_union9 + frame : anon_union10 {{endif}} {{if True}} @@ -18070,7 +18929,7 @@ cdef class cudaEglFrame_st: def __init__(self, void_ptr _ptr = 0): pass {{if True}} - self._frame = anon_union9(_ptr=self._pvt_ptr) + self._frame = anon_union10(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): if self._val_ptr is not NULL: @@ -18118,8 +18977,8 @@ cdef class cudaEglFrame_st: def frame(self): return self._frame @frame.setter - def frame(self, frame not None : anon_union9): - string.memcpy(&self._pvt_ptr[0].frame, frame.getPtr(), sizeof(self._pvt_ptr[0].frame)) + def frame(self, frame not None : anon_union10): + string.memcpy(&self._pvt_ptr[0].frame, frame.getPtr(), sizeof(self._pvt_ptr[0].frame)) {{endif}} {{if True}} @property @@ -18542,7 +19401,7 @@ def cudaDeviceSynchronize(): Returns ------- cudaError_t - :py:obj:`~.cudaSuccess` + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorStreamCaptureUnsupported` See Also -------- @@ -20463,7 +21322,7 @@ def cudaStreamCreate(): 
See Also -------- - :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate` + :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate` """ cdef cudaStream_t pStream = cudaStream_t() with nogil: @@ -20507,7 +21366,7 @@ def cudaStreamCreateWithFlags(unsigned int flags): See Also -------- - :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate` + :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate` """ cdef cudaStream_t pStream = cudaStream_t() with nogil: @@ -20606,7 +21465,7 @@ def cudaStreamGetPriority(hStream): See Also -------- - :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaDeviceGetStreamPriorityRange`, 
:py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cuStreamGetPriority` + :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaDeviceGetStreamPriorityRange`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cuStreamGetPriority` """ cdef cyruntime.cudaStream_t cyhStream if hStream is None: @@ -26750,6 +27609,7 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati bytes to copy. `stream` is the stream in which the operation is enqueued. The memory range must refer to managed memory allocated via :py:obj:`~.cudaMallocManaged` or declared via managed variables, or it + may also refer to memory allocated from a managed memory pool, or it may also refer to system-allocated memory on systems with non-zero cudaDevAttrPageableMemoryAccess. @@ -28386,6 +29246,22 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]): /proc/devices users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c 0` + To create a managed memory pool, applications must set + :py:obj:`~.cudaMemPoolProps`:cudaMemAllocationType to + :py:obj:`~.cudaMemAllocationTypeManaged`. + :py:obj:`~.cudaMemPoolProps`::cudaMemAllocationHandleType must also be + set to :py:obj:`~.cudaMemHandleTypeNone` since IPC is not supported. + For managed memory pools, :py:obj:`~.cudaMemPoolProps`::cudaMemLocation + will be treated as the preferred location for all allocations created + from the pool. An application can also set + :py:obj:`~.cudaMemLocationTypeNone` to indicate no preferred location. + :py:obj:`~.cudaMemPoolProps.maxSize` must be set to zero for managed + memory pools. :py:obj:`~.cudaMemPoolProps.usage` should be zero as + decompress for managed memory is not supported. For managed memory + pools, all devices on the system must have non-zero + :py:obj:`~.concurrentManagedAccess`. 
If not, this call returns + :py:obj:`~.cudaErrorNotSupported` + Parameters ---------- poolProps : :py:obj:`~.cudaMemPoolProps` @@ -28404,7 +29280,7 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]): Notes ----- - Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC. + Specifying :py:obj:`~.cudaMemHandleTypeNone` creates a memory pool that will not support IPC. """ cdef cudaMemPool_t memPool = cudaMemPool_t() cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps is not None else NULL @@ -35324,7 +36200,7 @@ def cudaGraphConditionalHandleCreate(graph, unsigned int defaultLaunchValue, uns Parameters ---------- - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` + graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` Graph which will contain the conditional node using this handle. defaultLaunchValue : unsigned int Optional initial value for the conditional variable. Applied at the @@ -35595,9 +36471,9 @@ def cudaLibraryLoadData(code, jitOptions : Optional[tuple[cudaJitOption] | list[ under the "CUDA environment variables" section. The `code` may be a `cubin` or `fatbin` as output by nvcc, or a NULL- - terminated `PTX`, either as output by nvcc or hand-written. A fatbin - should also contain relocatable code when doing separate compilation. - Please also see the documentation for nvrtc + terminated `PTX`, either as output by nvcc or hand-written, or `Tile` + IR data. A fatbin should also contain relocatable code when doing + separate compilation. 
Please also see the documentation for nvrtc (https://docs.nvidia.com/cuda/nvrtc/index.html), nvjitlink (https://docs.nvidia.com/cuda/nvjitlink/index.html), and nvfatbin (https://docs.nvidia.com/cuda/nvfatbin/index.html) for more information @@ -35695,8 +36571,9 @@ def cudaLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[cudaJitO The file should be a `cubin` file as output by nvcc, or a `PTX` file either as output by nvcc or handwritten, or a `fatbin` file as output - by nvcc. A fatbin should also contain relocatable code when doing - separate compilation. Please also see the documentation for nvrtc + by nvcc or hand-written, or `Tile` IR file. A fatbin should also + contain relocatable code when doing separate compilation. Please also + see the documentation for nvrtc (https://docs.nvidia.com/cuda/nvrtc/index.html), nvjitlink (https://docs.nvidia.com/cuda/nvjitlink/index.html), and nvfatbin (https://docs.nvidia.com/cuda/nvfatbin/index.html) for more information @@ -36198,6 +37075,224 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i return (_dict_cudaError_t[err],) {{endif}} +{{if 'cudaExecutionCtxDestroy' in found_functions}} + +@cython.embedsignature(True) +def cudaExecutionCtxDestroy(ctx): + """ Destroy a execution context. + + Destroys the specified execution context `ctx`. It is the + responsibility of the caller to ensure that no API call issues using + `ctx` while :py:obj:`~.cudaExecutionCtxDestroy()` is executing or + subsequently. + + If `ctx` is a green context, any resources provisioned for it (that + were initially available via the resource descriptor) are released as + well. + + The API does not destroy streams created via + :py:obj:`~.cudaExecutionCtxStreamCreate`. Users are expected to destroy + these streams explicitly using :py:obj:`~.cudaStreamDestroy` to avoid + resource leaks. 
Once the execution context is destroyed, any subsequent + API calls involving these streams will return + :py:obj:`~.cudaErrorStreamDetached` with the exception of the following + APIs: + + - :py:obj:`~.cudaStreamDestroy`. Note this is only supported on CUDA + drivers 13.1 and above. + + Additionally, the API will invalidate all active captures on these + streams. + + Passing in a `ctx` that was not explicitly created via CUDA Runtime + APIs is not allowed and will result in undefined behavior. + + Parameters + ---------- + ctx : :py:obj:`~.cudaExecutionContext_t` + Execution context to destroy (required parameter, see note below) + + Returns + ------- + cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError` + + See Also + -------- + :py:obj:`~.cudaGreenCtxCreate` + """ + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + pctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + with nogil: + err = cyruntime.cudaExecutionCtxDestroy(cyctx) + return (_dict_cudaError_t[err],) +{{endif}} + +{{if 'cudaExecutionCtxSynchronize' in found_functions}} + +@cython.embedsignature(True) +def cudaExecutionCtxSynchronize(ctx): + """ Block for the specified execution context's tasks to complete. + + Blocks until the specified execution context has completed all + preceding requested tasks. If the specified execution context is the + device (primary) context obtained via + :py:obj:`~.cudaDeviceGetExecutionCtx`, green contexts that have been + created on the device will also be synchronized. + + The API returns an error if one of the preceding tasks failed. 
+ + Parameters + ---------- + ctx : :py:obj:`~.cudaExecutionContext_t` + Execution context to synchronize (required parameter, see note + below) + + Returns + ------- + cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorInvalidValue` + + See Also + -------- + :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxDestroy`, :py:obj:`~.cudaDeviceSynchronize`, :py:obj:`~.cuCtxSynchronize_v2` + """ + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + pctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + with nogil: + err = cyruntime.cudaExecutionCtxSynchronize(cyctx) + return (_dict_cudaError_t[err],) +{{endif}} + +{{if 'cudaExecutionCtxRecordEvent' in found_functions}} + +@cython.embedsignature(True) +def cudaExecutionCtxRecordEvent(ctx, event): + """ Records an event for the specified execution context. + + Captures in `event` all the activities of the execution context `ctx` + at the time of this call. `event` and `ctx` must be from the same CUDA + device, otherwise :py:obj:`~.cudaErrorInvalidHandle` will be returned. + Calls such as :py:obj:`~.cudaEventQuery()` or + :py:obj:`~.cudaExecutionCtxWaitEvent()` will then examine or wait for + completion of the work that was captured. Uses of `ctx` after this call + do not modify `event`. If the execution context passed to `ctx` is the + device (primary) context obtained via + :py:obj:`~.cudaDeviceGetExecutionCtx()`, `event` will capture all the + activities of the green contexts created on the device as well. 
+ + Parameters + ---------- + ctx : :py:obj:`~.cudaExecutionContext_t` + Execution context to record event for (required parameter, see note + below) + event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` + Event to record + + Returns + ------- + cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidHandle`, :py:obj:`~.cudaErrorStreamCaptureUnsupported` + + See Also + -------- + :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaExecutionCtxWaitEvent`, :py:obj:`~.cuCtxRecordEvent`, :py:obj:`~.cuGreenCtxRecordEvent` + + Notes + ----- + The API will return :py:obj:`~.cudaErrorStreamCaptureUnsupported` if the specified execution context `ctx` has a stream in the capture mode. In such a case, the call will invalidate all the conflicting captures. + """ + cdef cyruntime.cudaEvent_t cyevent + if event is None: + pevent = 0 + elif isinstance(event, (cudaEvent_t,driver.CUevent)): + pevent = int(event) + else: + pevent = int(cudaEvent_t(event)) + cyevent = pevent + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + pctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + with nogil: + err = cyruntime.cudaExecutionCtxRecordEvent(cyctx, cyevent) + return (_dict_cudaError_t[err],) +{{endif}} + +{{if 'cudaExecutionCtxWaitEvent' in found_functions}} + +@cython.embedsignature(True) +def cudaExecutionCtxWaitEvent(ctx, event): + """ Make an execution context wait on an event. + + Makes all future work submitted to execution context `ctx` wait for all + work captured in `event`. The synchronization will be performed on the + device and will not block the calling CPU thread. See + :py:obj:`~.cudaExecutionCtxRecordEvent()` for details on what is + captured by an event. 
If the execution context passed to `ctx` is the + device (primary) context obtained via + :py:obj:`~.cudaDeviceGetExecutionCtx()`, all green contexts created on + the device will wait for `event` as well. + + Parameters + ---------- + ctx : :py:obj:`~.cudaExecutionContext_t` + Execution context to wait for (required parameter, see note below) + event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` + Event to wait on + + Returns + ------- + cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidHandle`, :py:obj:`~.cudaErrorStreamCaptureUnsupported` + + See Also + -------- + :py:obj:`~.cudaExecutionCtxRecordEvent`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cuCtxWaitEvent`, :py:obj:`~.cuGreenCtxWaitEvent` + + Notes + ----- + `event` may be from a different execution context or device than `ctx`. + + The API will return :py:obj:`~.cudaErrorStreamCaptureUnsupported` and invalidate the capture if the specified event `event` is part of an ongoing capture sequence or if the specified execution context `ctx` has a stream in the capture mode. 
+ """ + cdef cyruntime.cudaEvent_t cyevent + if event is None: + pevent = 0 + elif isinstance(event, (cudaEvent_t,driver.CUevent)): + pevent = int(event) + else: + pevent = int(cudaEvent_t(event)) + cyevent = pevent + cdef cyruntime.cudaExecutionContext_t cyctx + if ctx is None: + pctx = 0 + elif isinstance(ctx, (cudaExecutionContext_t,)): + pctx = int(ctx) + else: + pctx = int(cudaExecutionContext_t(ctx)) + cyctx = pctx + with nogil: + err = cyruntime.cudaExecutionCtxWaitEvent(cyctx, cyevent) + return (_dict_cudaError_t[err],) +{{endif}} + {{if 'cudaGetExportTable' in found_functions}} @cython.embedsignature(True) @@ -37590,6 +38685,12 @@ def sizeof(objType): {{if 'dim3' in found_struct}} if objType == dim3: return sizeof(cyruntime.dim3){{endif}} + {{if 'cudaDevResourceDesc_t' in found_types}} + if objType == cudaDevResourceDesc_t: + return sizeof(cyruntime.cudaDevResourceDesc_t){{endif}} + {{if 'cudaExecutionContext_t' in found_types}} + if objType == cudaExecutionContext_t: + return sizeof(cyruntime.cudaExecutionContext_t){{endif}} {{if 'cudaChannelFormatDesc' in found_struct}} if objType == cudaChannelFormatDesc: return sizeof(cyruntime.cudaChannelFormatDesc){{endif}} @@ -37740,6 +38841,27 @@ def sizeof(objType): {{if 'cudaExternalSemaphoreWaitParams' in found_struct}} if objType == cudaExternalSemaphoreWaitParams: return sizeof(cyruntime.cudaExternalSemaphoreWaitParams){{endif}} + {{if 'cudaDevSmResource' in found_struct}} + if objType == cudaDevSmResource: + return sizeof(cyruntime.cudaDevSmResource){{endif}} + {{if 'cudaDevWorkqueueConfigResource' in found_struct}} + if objType == cudaDevWorkqueueConfigResource: + return sizeof(cyruntime.cudaDevWorkqueueConfigResource){{endif}} + {{if 'cudaDevWorkqueueResource' in found_struct}} + if objType == cudaDevWorkqueueResource: + return sizeof(cyruntime.cudaDevWorkqueueResource){{endif}} + {{if 'cudaDevSmResourceGroupParams_st' in found_struct}} + if objType == cudaDevSmResourceGroupParams_st: + return 
sizeof(cyruntime.cudaDevSmResourceGroupParams_st){{endif}} + {{if 'cudaDevSmResourceGroupParams' in found_types}} + if objType == cudaDevSmResourceGroupParams: + return sizeof(cyruntime.cudaDevSmResourceGroupParams){{endif}} + {{if 'cudaDevResource_st' in found_struct}} + if objType == cudaDevResource_st: + return sizeof(cyruntime.cudaDevResource_st){{endif}} + {{if 'cudaDevResource' in found_types}} + if objType == cudaDevResource: + return sizeof(cyruntime.cudaDevResource){{endif}} {{if 'cudaStream_t' in found_types}} if objType == cudaStream_t: return sizeof(cyruntime.cudaStream_t){{endif}} @@ -37945,6 +39067,14 @@ def sizeof(objType): cdef int _add_native_handle_getters() except?-1: from cuda.bindings.utils import _add_cuda_native_handle_getter + {{if 'cudaDevResourceDesc_t' in found_types}} + def cudaDevResourceDesc_t_getter(cudaDevResourceDesc_t x): return (x._pvt_ptr[0]) + _add_cuda_native_handle_getter(cudaDevResourceDesc_t, cudaDevResourceDesc_t_getter) + {{endif}} + {{if 'cudaExecutionContext_t' in found_types}} + def cudaExecutionContext_t_getter(cudaExecutionContext_t x): return (x._pvt_ptr[0]) + _add_cuda_native_handle_getter(cudaExecutionContext_t, cudaExecutionContext_t_getter) + {{endif}} {{if 'cudaArray_t' in found_types}} def cudaArray_t_getter(cudaArray_t x): return (x._pvt_ptr[0]) _add_cuda_native_handle_getter(cudaArray_t, cudaArray_t_getter) diff --git a/cuda_bindings/cuda/bindings/utils/_ptx_utils.py b/cuda_bindings/cuda/bindings/utils/_ptx_utils.py index 038492f6ab..7041c2338a 100644 --- a/cuda_bindings/cuda/bindings/utils/_ptx_utils.py +++ b/cuda_bindings/cuda/bindings/utils/_ptx_utils.py @@ -48,6 +48,7 @@ "8.7": (12, 8), "8.8": (12, 9), "9.0": (13, 0), + "9.1": (13, 1), } diff --git a/cuda_bindings/docs/nv-versions.json b/cuda_bindings/docs/nv-versions.json index 369b94db56..d27f90a398 100644 --- a/cuda_bindings/docs/nv-versions.json +++ b/cuda_bindings/docs/nv-versions.json @@ -3,6 +3,10 @@ "version": "latest", "url": 
"https://nvidia.github.io/cuda-python/cuda-bindings/latest/" }, + { + "version": "13.1.0", + "url": "https://nvidia.github.io/cuda-python/cuda-bindings/13.1.0/" + }, { "version": "13.0.3", "url": "https://nvidia.github.io/cuda-python/cuda-bindings/13.0.3/" diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index bbc0ac2f6f..f1e9a13949 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -321,6 +321,71 @@ Data types used by CUDA driver When using stream capture, create an event wait node instead of the default behavior. This flag is invalid when used outside of capture. +.. autoclass:: cuda.bindings.driver.CUatomicOperation + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_ADD + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_MIN + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_MAX + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_INCREMENT + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_DECREMENT + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_AND + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_OR + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_XOR + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_EXCHANGE + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_CAS + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_ADD + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_MIN + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_MAX + + + .. 
autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_MAX + +.. autoclass:: cuda.bindings.driver.CUatomicOperationCapability + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SIGNED + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_UNSIGNED + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_REDUCTION + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_32 + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_64 + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_128 + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_VECTOR_32x4 + .. autoclass:: cuda.bindings.driver.CUstreamWaitValue_flags .. autoattribute:: cuda.bindings.driver.CUstreamWaitValue_flags.CU_STREAM_WAIT_VALUE_GEQ @@ -397,6 +462,12 @@ Data types used by CUDA driver Insert a memory barrier of the specified type + .. autoattribute:: cuda.bindings.driver.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_ATOMIC_REDUCTION + + + Perform a atomic reduction. See :py:obj:`~.CUstreamBatchMemOpParams`::atomicReduction + + .. autoattribute:: cuda.bindings.driver.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES @@ -415,6 +486,32 @@ Data types used by CUDA driver Limit memory barrier scope to the GPU. +.. autoclass:: cuda.bindings.driver.CUstreamAtomicReductionOpType + + .. autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionOpType.CU_STREAM_ATOMIC_REDUCTION_OP_OR + + + Performs an atomic OR: *(address) = *(address) | value + + + .. autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionOpType.CU_STREAM_ATOMIC_REDUCTION_OP_AND + + + Performs an atomic AND: *(address) = *(address) & value + + + .. 
autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionOpType.CU_STREAM_ATOMIC_REDUCTION_OP_ADD + + + Performs an atomic ADD: *(address) = *(address) + value + +.. autoclass:: cuda.bindings.driver.CUstreamAtomicReductionDataType + + .. autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionDataType.CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32 + + + .. autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionDataType.CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_64 + .. autoclass:: cuda.bindings.driver.CUoccupancy_flags .. autoattribute:: cuda.bindings.driver.CUoccupancy_flags.CU_OCCUPANCY_DEFAULT @@ -756,6 +853,90 @@ Data types used by CUDA driver 4 channel unorm R10G10B10A2 RGB format + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_PACKED_422 + + + 4 channel unsigned 8-bit YUV packed format, with 4:2:2 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_PACKED_444 + + + 4 channel unsigned 8-bit YUV packed format, with 4:4:4 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_SEMIPLANAR_420 + + + 3 channel unsigned 8-bit YUV semi-planar format, with 4:2:0 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT16_SEMIPLANAR_420 + + + 3 channel unsigned 16-bit YUV semi-planar format, with 4:2:0 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_SEMIPLANAR_422 + + + 3 channel unsigned 8-bit YUV semi-planar format, with 4:2:2 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT16_SEMIPLANAR_422 + + + 3 channel unsigned 16-bit YUV semi-planar format, with 4:2:2 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_SEMIPLANAR_444 + + + 3 channel unsigned 8-bit YUV semi-planar format, with 4:4:4 sampling + + + .. 
autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT16_SEMIPLANAR_444 + + + 3 channel unsigned 16-bit YUV semi-planar format, with 4:4:4 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_PLANAR_420 + + + 3 channel unsigned 8-bit YUV planar format, with 4:2:0 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT16_PLANAR_420 + + + 3 channel unsigned 16-bit YUV planar format, with 4:2:0 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_PLANAR_422 + + + 3 channel unsigned 8-bit YUV planar format, with 4:2:2 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT16_PLANAR_422 + + + 3 channel unsigned 16-bit YUV planar format, with 4:2:2 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT8_PLANAR_444 + + + 3 channel unsigned 8-bit YUV planar format, with 4:4:4 sampling + + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_UINT16_PLANAR_444 + + + 3 channel unsigned 16-bit YUV planar format, with 4:4:4 sampling + + .. autoattribute:: cuda.bindings.driver.CUarray_format.CU_AD_FORMAT_MAX .. autoclass:: cuda.bindings.driver.CUaddress_mode @@ -1716,6 +1897,12 @@ Data types used by CUDA driver Link between the device and the host supports only some native atomic operations + .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED + + + Device supports atomic reduction operations in stream batch memory operations + + .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX .. autoclass:: cuda.bindings.driver.CUpointer_attribute @@ -2522,6 +2709,14 @@ Data types used by CUDA driver Applies to: compiler only + .. 
autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_BINARY_LOADER_THREAD_COUNT + + + This option specifies the maximum number of concurrent threads to use when compiling device code. If the specified value is 1, the option will be ignored. If the specified value is 0, the number of threads will match the number of CPUs on the underlying machine. Otherwise, if the option is N, then up to N threads will be used. This option is ignored if the env var CUDA_BINARY_LOADER_THREAD_COUNT is set. Option type: unsigned int + + Applies to: compiler and linker + + .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_NUM_OPTIONS .. autoclass:: cuda.bindings.driver.CUjit_target @@ -4035,6 +4230,12 @@ Data types used by CUDA driver This error indicates that an error happened during the key rotation sequence. + .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_STREAM_DETACHED + + + This error indicates that the requested operation is not permitted because the stream is in a detached state. This can occur if the green context associated with the stream has been destroyed, limiting the stream's operational capabilities. + + .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_UNKNOWN @@ -4077,71 +4278,6 @@ Data types used by CUDA driver Only some CUDA-valid atomic operations over the link are supported. -.. autoclass:: cuda.bindings.driver.CUatomicOperation - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_ADD - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_MIN - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_MAX - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_INCREMENT - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_DECREMENT - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_AND - - - .. 
autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_OR - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_XOR - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_EXCHANGE - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_CAS - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_ADD - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_MIN - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_MAX - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_MAX - -.. autoclass:: cuda.bindings.driver.CUatomicOperationCapability - - .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SIGNED - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_UNSIGNED - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_REDUCTION - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_32 - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_64 - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_128 - - - .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_VECTOR_32x4 - .. autoclass:: cuda.bindings.driver.CUresourceViewFormat .. autoattribute:: cuda.bindings.driver.CUresourceViewFormat.CU_RES_VIEW_FORMAT_NONE @@ -6701,7 +6837,7 @@ This section describes the CUDA multicast object operations exposed by the low-l -A multicast object created via cuMulticastCreate enables certain memory operations to be broadcast to a team of devices. Devices can be added to a multicast object via cuMulticastAddDevice. 
Memory can be bound on each participating device via either cuMulticastBindMem or cuMulticastBindAddr. Multicast objects can be mapped into a device's virtual address space using the virtual memmory management APIs (see cuMemMap and cuMemSetAccess). +A multicast object created via cuMulticastCreate enables certain memory operations to be broadcast to a team of devices. Devices can be added to a multicast object via cuMulticastAddDevice. Memory can be bound on each participating device via cuMulticastBindMem, cuMulticastBindMem_v2, cuMulticastBindAddr, or cuMulticastBindAddr_v2. Multicast objects can be mapped into a device's virtual address space using the virtual memory management APIs (see cuMemMap and cuMemSetAccess). @@ -6716,7 +6852,9 @@ Support for multicast on a specific device can be queried using the device attri .. autofunction:: cuda.bindings.driver.cuMulticastCreate .. autofunction:: cuda.bindings.driver.cuMulticastAddDevice .. autofunction:: cuda.bindings.driver.cuMulticastBindMem +.. autofunction:: cuda.bindings.driver.cuMulticastBindMem_v2 .. autofunction:: cuda.bindings.driver.cuMulticastBindAddr +.. autofunction:: cuda.bindings.driver.cuMulticastBindAddr_v2 .. autofunction:: cuda.bindings.driver.cuMulticastUnbind .. autofunction:: cuda.bindings.driver.cuMulticastGetGranularity @@ -7142,6 +7280,9 @@ This section describes the coredump attribute control functions of the low-level .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_SKIP_CONSTBANK_MEMORY + .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_GZIP_COMPRESS + + .. autoattribute:: cuda.bindings.driver.CUCoredumpGenerationFlags.CU_COREDUMP_LIGHTWEIGHT_FLAGS .. 
autofunction:: cuda.bindings.driver.cuCoredumpGetAttribute @@ -7152,13 +7293,13 @@ This section describes the coredump attribute control functions of the low-level Green Contexts -------------- -This section describes the APIs for creation and manipulation of green contexts in the CUDA driver. Green contexts are a lightweight alternative to traditional contexts, with the ability to pass in a set of resources that they should be initialized with. This allows the developer to represent distinct spatial partitions of the GPU, provision resources for them, and target them via the same programming model that CUDA exposes (streams, kernel launches, etc.). +This section describes the APIs for creation and manipulation of green contexts in the CUDA driver. Green contexts are a lightweight alternative to traditional contexts, that can be used to select a subset of device resources. This allows the developer to, for example, select SMs from distinct spatial partitions of the GPU and target them via CUDA stream operations, kernel launches, etc. -There are 4 main steps to using these new set of APIs. +Here are the broad initial steps to follow to get started: -- (1) Start with an initial set of resources, for example via cuDeviceGetDevResource. Only SM type is supported today. +- (1) Start with an initial set of resources. For SM resources, they can be fetched via cuDeviceGetDevResource. In case of workqueues, a new configuration can be used or an existing one queried via the cuDeviceGetDevResource API. @@ -7166,7 +7307,7 @@ There are 4 main steps to using these new set of APIs. -- (2) Partition this set of resources by providing them as input to a partition API, for example: cuDevSmResourceSplitByCount. +- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend cuDevSmResourceSplit. Changing the workqueue configuration can be done directly in place. 
@@ -7182,7 +7323,16 @@ There are 4 main steps to using these new set of APIs. -- (4) Provision the resources and create a green context via cuGreenCtxCreate. +- (4) Create a green context via cuGreenCtxCreate. This provisions the resource, such as workqueues (until this step it was only a configuration specification). + + + + + + + +- (5) Create a stream via cuGreenCtxStreamCreate, and use it throughout your application. + @@ -7194,53 +7344,118 @@ There are 4 main steps to using these new set of APIs. -For ``CU_DEV_RESOURCE_TYPE_SM``\ , the partitions created have minimum SM count requirements, often rounding up and aligning the minCount provided to cuDevSmResourceSplitByCount. These requirements can be queried with cuDeviceGetDevResource from step (1) above to determine the minimum partition size (``sm.minSmPartitionSize``\ ) and alignment granularity (``sm.smCoscheduledAlignment``\ ). +SMs -While it's recommended to use cuDeviceGetDevResource for accurate information, here is a guideline for each compute architecture: -- On Compute Architecture 6.X: The minimum count is 2 SMs and must be a multiple of 2. +There are two possible partition operations - with cuDevSmResourceSplitByCount the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, cuDevSmResourceSplit is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with cuDeviceGetDevResource to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture: +- On Compute Architecture 7.X, 8.X, and all Tegra SoC: -- On Compute Architecture 7.X: The minimum count is 2 SMs and must be a multiple of 2. + - The smCount must be a multiple of 2. 
-- On Compute Architecture 8.X: The minimum count is 4 SMs and must be a multiple of 2. + - The alignment (and default value of coscheduledSmCount) is 2. -- On Compute Architecture 9.0+: The minimum count is 8 SMs and must be a multiple of 8. +- On Compute Architecture 9.0+: + - The smCount must be a multiple of 8, or coscheduledSmCount if provided. -In the future, flags can be provided to tradeoff functional and performance characteristics versus finer grained SM partitions. -Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources (like HW connections, see ::CUDA_DEVICE_MAX_CONNECTIONS) that could cause a dependency. Additionally, in certain scenarios, it is possible for the workload to run on more SMs than was provisioned (but never less). The following are two scenarios which can exhibit this behavior: + + - The alignment (and default value of coscheduledSmCount) is 8. While the maximum value for coscheduled SM count is 32 on all Compute Architecture 9.0+, it's recommended to follow cluster size requirements. The portable cluster size and the max cluster size should be used in order to benefit from this co-scheduling. + + + + + + + + + + + + + + + +Workqueues + + + + + +For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\ field. The ``sharingScope``\ field determines how workqueue resources are shared: + +- ``CU_WORKQUEUE_SCOPE_DEVICE_CTX:``\ Use all shared workqueue resources across all contexts (default driver behavior). + + + + + + + +- ``CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED:``\ When possible, use non-overlapping workqueue resources with other balanced green contexts. 
+ + + + + + + + + + + +The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can be queried from the primary context via cuCtxGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping. + + + +For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE``\ , the resource represents a pre-existing workqueue that can be retrieved from existing contexts or green contexts. This allows reusing workqueue resources across different green contexts. + + + + + +On Concurrency + + + + + +Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees. + + + +Additionally, there are two known scenarios, where its possible for the workload to run on more SMs than was provisioned (but never less). - On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE``\ is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client. @@ -7253,8 +7468,14 @@ Even if the green contexts have disjoint SM partitions, it is not guaranteed tha - On Compute Architecture 9.x: When a module with dynamic parallelism (CDP) is loaded, all future kernels running under green contexts may use and share an additional set of 2 SMs. .. autoclass:: cuda.bindings.driver.CUdevSmResource_st +.. autoclass:: cuda.bindings.driver.CUdevWorkqueueConfigResource_st +.. autoclass:: cuda.bindings.driver.CUdevWorkqueueResource_st +.. autoclass:: cuda.bindings.driver.CU_DEV_SM_RESOURCE_GROUP_PARAMS_st .. autoclass:: cuda.bindings.driver.CUdevResource_st .. 
autoclass:: cuda.bindings.driver.CUdevSmResource +.. autoclass:: cuda.bindings.driver.CUdevWorkqueueConfigResource +.. autoclass:: cuda.bindings.driver.CUdevWorkqueueResource +.. autoclass:: cuda.bindings.driver.CU_DEV_SM_RESOURCE_GROUP_PARAMS .. autoclass:: cuda.bindings.driver.CUdevResource .. autoclass:: cuda.bindings.driver.CUgreenCtxCreate_flags @@ -7263,12 +7484,19 @@ Even if the green contexts have disjoint SM partitions, it is not guaranteed tha Required. Creates a default stream to use inside the green context -.. autoclass:: cuda.bindings.driver.CUdevSmResourceSplit_flags +.. autoclass:: cuda.bindings.driver.CUdevSmResourceGroup_flags + + .. autoattribute:: cuda.bindings.driver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_DEFAULT - .. autoattribute:: cuda.bindings.driver.CUdevSmResourceSplit_flags.CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING + .. autoattribute:: cuda.bindings.driver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_BACKFILL - .. autoattribute:: cuda.bindings.driver.CUdevSmResourceSplit_flags.CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE +.. autoclass:: cuda.bindings.driver.CUdevSmResourceSplitByCount_flags + + .. autoattribute:: cuda.bindings.driver.CUdevSmResourceSplitByCount_flags.CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING + + + .. autoattribute:: cuda.bindings.driver.CUdevSmResourceSplitByCount_flags.CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE .. autoclass:: cuda.bindings.driver.CUdevResourceType @@ -7280,8 +7508,36 @@ Even if the green contexts have disjoint SM partitions, it is not guaranteed tha Streaming multiprocessors related information + + .. autoattribute:: cuda.bindings.driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG + + + Workqueue configuration related information + + + .. autoattribute:: cuda.bindings.driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE + + + Pre-existing workqueue related information + +.. autoclass:: cuda.bindings.driver.CUdevWorkqueueConfigScope + + .. 
autoattribute:: cuda.bindings.driver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_DEVICE_CTX + + + Use all shared workqueue resources across all contexts. Default driver behaviour. + + + .. autoattribute:: cuda.bindings.driver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED + + + When possible, use non-overlapping workqueue resources with other balanced green contexts. + .. autoclass:: cuda.bindings.driver.CUdevResourceDesc .. autoclass:: cuda.bindings.driver.CUdevSmResource +.. autoclass:: cuda.bindings.driver.CUdevWorkqueueConfigResource +.. autoclass:: cuda.bindings.driver.CUdevWorkqueueResource +.. autoclass:: cuda.bindings.driver.CU_DEV_SM_RESOURCE_GROUP_PARAMS .. autofunction:: cuda.bindings.driver._CONCAT_OUTER .. autofunction:: cuda.bindings.driver.cuGreenCtxCreate .. autofunction:: cuda.bindings.driver.cuGreenCtxDestroy @@ -7297,7 +7553,7 @@ Even if the green contexts have disjoint SM partitions, it is not guaranteed tha .. autofunction:: cuda.bindings.driver.cuGreenCtxStreamCreate .. autofunction:: cuda.bindings.driver.cuGreenCtxGetId .. autoattribute:: cuda.bindings.driver.RESOURCE_ABI_VERSION -.. autoattribute:: cuda.bindings.driver.RESOURCE_ABI_EXTERNAL_BYTES +.. autoattribute:: cuda.bindings.driver.RESOURCE_ABI_BYTES .. autoattribute:: cuda.bindings.driver._CONCAT_INNER .. autoattribute:: cuda.bindings.driver._CONCAT_OUTER diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index 079cd39aad..959c37a7f4 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -622,6 +622,26 @@ Inhibit all warning messages. + - ``--Wreorder``\ (``-Wreorder``\ ) + +Generate warnings when member initializers are reordered. + + + + + + + + - ``--warning-as-error=``\ ,... (``-Werror``\ ) + +Make warnings of the specified kinds into errors. 
The following is the list of warning kinds accepted by this option: + + + + + + + - ``--restrict``\ (``-restrict``\ ) Programmer assertion that all kernel pointer parameters are restrict pointers. diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index d155f85ebc..3f5931a4da 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -608,6 +608,278 @@ This section describes the library management functions of the CUDA runtime appl .. autofunction:: cuda.bindings.runtime.cudaLibraryEnumerateKernels .. autofunction:: cuda.bindings.runtime.cudaKernelSetAttributeForDevice +Execution Context Management +---------------------------- + +This section describes the execution context management functions of the CUDA runtime application programming interface. + + + + + +**Overview** + + + +A CUDA execution context cudaExecutionContext_t serves as an abstraction for the contexts exposed by the CUDA Runtime, specifically green contexts and the primary context, and provides a unified programming model and API interface for contexts in the Runtime. + +There are two primary ways today to obtain an execution context: + +- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device. + + + + + + + +- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context. + + + + + + + + + +Once you have an execution context at hand, you can perform context-level operations via the CUDA Runtime APIs. This includes: + +- Submitting work via streams created with cudaExecutionCtxStreamCreate. + + + + + + + +- Querying context via cudaExecutionCtxGetDevResource, cudaExecutionCtxGetDevice, etc. + + + + + + + +- Synchronizing and tracking context-level operations via cudaExecutionCtxSynchronize, cudaExecutionCtxRecordEvent, cudaExecutionCtxWaitEvent. 
+ + + + + + + +- Performing context-level graph node operations via cudaGraphAddNode by specifying the context in ``nodeParams``\ . Note that individual node creation APIs, such as cudaGraphAddKernelNode, do not support specifying an execution context. + + + + + + + + + +Note: The above APIs take in an explicit cudaExecutionContext_t handle and ignore the context that is current to the calling thread. This enables explicit context-based programming without relying on thread-local state. If no context is specified, the APIs return cudaErrorInvalidValue. + +Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into driver-level contexts, such as ::CUcontext or ::CUgreenCtx. + + + + + +**Lifetime of CUDA Resources** + + + +The lifetime of CUDA resources (memory, streams, events, modules, etc) is not tied to the lifetime of the execution context. Their lifetime is tied to the device against which they were created. As such, usage of cudaDeviceReset() should be avoided to persist the lifetime of these resources. + + + + + +**APIs Operating on Current Context** + + + +The CUDA runtime does not provide a way to set an execution context as current. Since the majority of the runtime APIs operate on the current context, we document below how the developer can work with these APIs. + + + +**APIs Operating on Device Resources** + + + +To work with these APIs (for example, cudaMalloc, cudaEventCreate, etc), developers are expected to call cudaSetDevice() prior to invoking them. Doing so does not impact functional correctness as these APIs operate on resources that are device-wide. If users have a context handle at hand, they can get the device handle from the context handle using cudaExecutionCtxGetDevice(). 
+ + + + + +**APIs Operating on Context Resources** + + + +These APIs (for example, cudaLaunchKernel, cudaMemcpyAsync, cudaMemsetAsync, etc) take in a stream and resources are inferred from the context bound to the stream at creation. See cudaExecutionCtxStreamCreate for more details. Developers are expected to use the stream-based APIs for context awareness and always pass an explicit stream handle to ensure context-awareness, and avoid reliance on the default NULL stream, which implicitly binds to the current context. + + + + + + + +**Green Contexts** + + + +Green contexts are a lightweight alternative to traditional contexts, that can be used to select a subset of device resources. This allows the developer to, for example, select SMs from distinct spatial partitions of the GPU and target them via CUDA stream operations, kernel launches, etc. + +Here are the broad initial steps to follow to get started: + +- (1) Start with an initial set of resources. For SM resources, they can be fetched via cudaDeviceGetDevResource. In case of workqueues, a new configuration can be used or an existing one queried via the cudaDeviceGetDevResource API. + + + + + + + +- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend cudaDevSmResourceSplit. Changing the workqueue configuration can be done directly in place. + + + + + + + +- (3) Finalize the specification of resources by creating a descriptor via cudaDevResourceGenerateDesc. + + + + + + + +- (4) Create a green context via cudaGreenCtxCreate. This provisions the resource, such as workqueues (until this step it was only a configuration specification). + + + + + + + +- (5) Create a stream via cudaExecutionCtxStreamCreate, and use it throughout your application. 
+ + + + + + + + + +SMs + +There are two possible partition operations - with cudaDevSmResourceSplitByCount the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, cudaDevSmResourceSplit is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with cudaDeviceGetDevResource to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture: + +- On Compute Architecture 7.X, 8.X, and all Tegra SoC: + + + + + + - The smCount must be a multiple of 2. + + + + + + + + - The alignment (and default value of coscheduledSmCount) is 2. + + + + + + + + + +- On Compute Architecture 9.0+: + + + + + + - The smCount must be a multiple of 8, or coscheduledSmCount if provided. + + + + + + + + - The alignment (and default value of coscheduledSmCount) is 8. While the maximum value for coscheduled SM count is 32 on all Compute Architecture 9.0+, it's recommended to follow cluster size requirements. The portable cluster size and the max cluster size should be used in order to benefit from this co-scheduling. + + + + + + + + + + + +Workqueues + +For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\ field. The ``sharingScope``\ field determines how workqueue resources are shared: + +- ``cudaDevWorkqueueConfigScopeDeviceCtx:``\ Use all shared workqueue resources across all contexts (default driver behavior). + + + + + + + +- ``cudaDevWorkqueueConfigScopeGreenCtxBalanced:``\ When possible, use non-overlapping workqueue resources with other balanced green contexts. 
+ + + + + + + + + +The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can be queried from the device via cudaDeviceGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping. + +For ``cudaDevResourceTypeWorkqueue``\ , the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts. + +On Concurrency + +Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and ``cudaDevWorkqueueConfigScopeGreenCtxBalanced``\ workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees. + +Additionally, there are two known scenarios, where it's possible for the workload to run on more SMs than was provisioned (but never less). + + + +- On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE``\ is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client. + + + + + + + +- On Compute Architecture 9.x: When a module with dynamic parallelism (CDP) is loaded, all future kernels running under green contexts may use and share an additional set of 2 SMs. + +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxDestroy +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxSynchronize +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxRecordEvent +.. autofunction:: cuda.bindings.runtime.cudaExecutionCtxWaitEvent + C++ API Routines ---------------- C++-style interface built on top of CUDA runtime API. 
@@ -631,11 +903,47 @@ This section describes the interactions between the CUDA Driver API and the CUDA -**Primary Contexts** +**Execution Contexts** + + + +The CUDA Runtime provides cudaExecutionContext_t as an abstraction over driver-level contexts—specifically, green contexts and the primary context. + +There are two primary ways to obtain an execution context: + +- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device. + + + + + + + +- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context. -There exists a one to one relationship between CUDA devices in the CUDA Runtime API and ::CUcontext s in the CUDA Driver API within a process. The specific context which the CUDA Runtime API uses for a device is called the device's primary context. From the perspective of the CUDA Runtime API, a device and its primary context are synonymous. + + + + + + +Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into a ::CUcontext or ::CUgreenCtx. + + + + + +**Primary Context (aka Device Execution Context)** + + + +The primary context is the default execution context associated with a device in the Runtime. It can be obtained via a call to cudaDeviceGetExecutionCtx(). There is a one-to-one mapping between CUDA devices in the runtime and their primary contexts within a process. + +From the CUDA Runtime’s perspective, a device and its primary context are functionally synonymous. + +Unless explicitly overridden, either by making a different context current via the Driver API (e.g., ::cuCtxSetCurrent()) or by using an explicit execution context handle, the Runtime will implicitly initialize and use the primary context for API calls as needed. 
@@ -645,16 +953,12 @@ There exists a one to one relationship between CUDA devices in the CUDA Runtime -CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current to to the calling host thread. +Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver ::CUcontext which is current to the calling host thread. If no ::CUcontext is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings. The function cudaInitDevice() ensures that the primary context is initialized for the requested device but does not make it current to the calling thread. The function cudaSetDevice() initializes the primary context for the specified device and makes it current to the calling thread by calling ::cuCtxSetCurrent(). -The CUDA Runtime API will automatically initialize the primary context for a device at the first CUDA Runtime API call which requires an active context. If no ::CUcontext is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context for a device will be selected, made current to the calling thread, and initialized. 
- -The context which the CUDA Runtime API initializes will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings. - Primary contexts will remain active until they are explicitly deinitialized using cudaDeviceReset(). The function cudaDeviceReset() will deinitialize the primary context for the calling thread's current device immediately. The context will remain current to all of the threads that it was current to. The next CUDA Runtime API call on any thread which requires an active context will trigger the reinitialization of that device's primary context. Note that primary contexts are shared resources. It is recommended that the primary context not be reset except just before exit or to recover from an unspecified launch failure. @@ -663,21 +967,19 @@ Note that primary contexts are shared resources. It is recommended that the prim -**Context Interoperability** +**CUcontext Interoperability** -Note that the use of multiple ::CUcontext s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended that the implicit one-to-one device-to-context mapping for the process provided by the CUDA Runtime API be used. +Note that the use of multiple ::CUcontext s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts cudaExecutionContext_t or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API. 
If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections. -The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. - - To use the pointer query and peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. +The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary CUcontext is current. To use the peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. All CUDA Runtime API state (e.g, global variables' addresses and values) travels with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one thread to another then all CUDA Runtime API state will move to that thread as well. -Please note that attaching to legacy contexts (those with a version of 3010 as returned by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return cudaErrorIncompatibleDriverContext in such cases. +Please note that attaching to legacy CUcontext (those with a version of 3010 as returned by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return cudaErrorIncompatibleDriverContext in such cases. @@ -828,6 +1130,11 @@ Data types used by CUDA Runtime .. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreHandleDesc .. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreSignalParams .. autoclass:: cuda.bindings.runtime.cudaExternalSemaphoreWaitParams +.. 
autoclass:: cuda.bindings.runtime.cudaDevSmResource +.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueConfigResource +.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueResource +.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroupParams_st +.. autoclass:: cuda.bindings.runtime.cudaDevResource_st .. autoclass:: cuda.bindings.runtime.cudalibraryHostUniversalFunctionAndDataTable .. autoclass:: cuda.bindings.runtime.cudaKernelNodeParams .. autoclass:: cuda.bindings.runtime.cudaKernelNodeParamsV2 @@ -2348,6 +2655,12 @@ Data types used by CUDA Runtime This error indicates one or more resources are insufficient or non-applicable for the operation. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamDetached + + + This error indicates that the requested operation is not permitted because the stream is in a detached state. This can occur if the green context associated with the stream has been destroyed, limiting the stream's operational capabilities. + + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnknown @@ -4568,6 +4881,55 @@ Data types used by CUDA Runtime Handle is an opaque handle file descriptor referencing a timeline semaphore +.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags + + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupDefault + + + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupBackfill + +.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags + + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitIgnoreSmCoscheduling + + + .. autoattribute:: cuda.bindings.runtime.cudaDevSmResourceSplitByCount_flags.cudaDevSmResourceSplitMaxPotentialClusterSize + +.. autoclass:: cuda.bindings.runtime.cudaDevResourceType + + .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeInvalid + + + .. 
autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeSm + + + Streaming multiprocessors related information + + + .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeWorkqueueConfig + + + Workqueue configuration related information + + + .. autoattribute:: cuda.bindings.runtime.cudaDevResourceType.cudaDevResourceTypeWorkqueue + + + Pre-existing workqueue related information + +.. autoclass:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope + + .. autoattribute:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeDeviceCtx + + + Use all shared workqueue resources on the device. Default driver behaviour. + + + .. autoattribute:: cuda.bindings.runtime.cudaDevWorkqueueConfigScope.cudaDevWorkqueueConfigScopeGreenCtxBalanced + + + When possible, use non-overlapping workqueue resources with other balanced green contexts. + .. autoclass:: cuda.bindings.runtime.cudaJitOption .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitMaxRegisters @@ -5470,6 +5832,8 @@ Data types used by CUDA Runtime .. autoclass:: cuda.bindings.runtime.cudaEglPlaneDesc .. autoclass:: cuda.bindings.runtime.cudaEglFrame .. autoclass:: cuda.bindings.runtime.cudaEglStreamConnection +.. autoclass:: cuda.bindings.runtime.cudaDevResourceDesc_t +.. autoclass:: cuda.bindings.runtime.cudaExecutionContext_t .. autoclass:: cuda.bindings.runtime.cudaArray_t .. autoclass:: cuda.bindings.runtime.cudaArray_const_t .. autoclass:: cuda.bindings.runtime.cudaMipmappedArray_t @@ -5480,6 +5844,8 @@ Data types used by CUDA Runtime .. autoclass:: cuda.bindings.runtime.cudaIpcEventHandle_t .. autoclass:: cuda.bindings.runtime.cudaIpcMemHandle_t .. autoclass:: cuda.bindings.runtime.cudaMemFabricHandle_t +.. autoclass:: cuda.bindings.runtime.cudaDevSmResourceGroupParams +.. autoclass:: cuda.bindings.runtime.cudaDevResource .. autoclass:: cuda.bindings.runtime.cudaStream_t .. autoclass:: cuda.bindings.runtime.cudaEvent_t .. 
autoclass:: cuda.bindings.runtime.cudaGraphicsResource_t @@ -5761,6 +6127,7 @@ Data types used by CUDA Runtime When /p flags of :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` is set to this, it indicates that application need waiter specific NvSciSyncAttr to be filled by :py:obj:`~.cudaDeviceGetNvSciSyncAttributes`. +.. autoattribute:: cuda.bindings.runtime.RESOURCE_ABI_BYTES .. autoattribute:: cuda.bindings.runtime.cudaGraphKernelNodePortDefault This port activates when the kernel has finished executing. diff --git a/cuda_bindings/docs/source/release/12.9.Y-notes.rst b/cuda_bindings/docs/source/release/12.9.5-notes.rst similarity index 63% rename from cuda_bindings/docs/source/release/12.9.Y-notes.rst rename to cuda_bindings/docs/source/release/12.9.5-notes.rst index 7dcee50d31..955a212c34 100644 --- a/cuda_bindings/docs/source/release/12.9.Y-notes.rst +++ b/cuda_bindings/docs/source/release/12.9.5-notes.rst @@ -3,7 +3,7 @@ .. module:: cuda.bindings -``cuda-bindings`` 12.9.Y Release notes +``cuda-bindings`` 12.9.5 Release notes ====================================== @@ -11,14 +11,16 @@ Highlights ---------- +* Added ``__cuda_stream__`` protocol support to ``driver.CUStream`` class, enabling better interoperability with libraries that expect this protocol. +* Python 3.9 support was dropped (end of life). Bug fixes --------- - +* Fixed ``cuStreamBeginCaptureToGraph`` to allow the ``dependencyData`` argument to be optional, matching the underlying CUDA API behavior. Known issues ------------ * Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. -* The graphics APIs in ``cuda.bindings.runtime`` are inadvertently disabled in 12.9.3. Users needing these APIs should update to 12.9.4. +* The graphics APIs in ``cuda.bindings.runtime`` are inadvertently disabled in 12.9.3. 
Users needing these APIs should update to 12.9.4 or higher. diff --git a/cuda_bindings/docs/source/release/13.X.Y-notes.rst b/cuda_bindings/docs/source/release/13.1.0-notes.rst similarity index 53% rename from cuda_bindings/docs/source/release/13.X.Y-notes.rst rename to cuda_bindings/docs/source/release/13.1.0-notes.rst index 00278667ec..5da7659ba3 100644 --- a/cuda_bindings/docs/source/release/13.X.Y-notes.rst +++ b/cuda_bindings/docs/source/release/13.1.0-notes.rst @@ -3,18 +3,27 @@ .. module:: cuda.bindings -``cuda-bindings`` 13.X.Y Release notes +``cuda-bindings`` 13.1.0 Release notes ====================================== +Released on Dec 4, 2025 Highlights ---------- +* Add support for new APIs introduced in CUDA 13.1. +* The ``cufile.read`` and ``cufile.write`` functions now return the number of bytes read or written (``ssize_t``) instead of returning ``None``, providing better feedback on I/O operations. +* Improved performance of cuFile bindings with approximately 60x faster construction and member access for generated low-level classes. +* Added ``__cuda_stream__`` protocol support to ``driver.CUStream`` class, enabling better interoperability with libraries that expect this protocol. +* Upgraded Cython dependency to version 3.2. +* Python 3.9 support was dropped (end of life). Bug fixes --------- +* Fixed ``cuStreamBeginCaptureToGraph`` to allow the ``dependencyData`` argument to be optional, matching the underlying CUDA API behavior. 
+ Backward incompatible changes ----------------------------- diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py index 03e0ece4d0..f6d0ac84ef 100644 --- a/cuda_bindings/setup.py +++ b/cuda_bindings/setup.py @@ -199,6 +199,9 @@ def parse_headers(header_dict): if discovered: found_struct += discovered + # TODO(#1312): make this work properly + found_types.append("CUstreamAtomicReductionDataType_enum") + return found_types, found_functions, found_values, found_struct, struct_list diff --git a/cuda_bindings/tests/cython/build_tests.bat b/cuda_bindings/tests/cython/build_tests.bat index fda860506e..e1bf73af17 100644 --- a/cuda_bindings/tests/cython/build_tests.bat +++ b/cuda_bindings/tests/cython/build_tests.bat @@ -5,5 +5,6 @@ REM SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE setlocal set CL=%CL% /I"%CUDA_HOME%\include" - cythonize -3 -i -Xfreethreading_compatible=True %~dp0test_*.pyx + REM Use -j 1 to side-step any process-pool issues and ensure deterministic single-threaded builds + cythonize -3 -j 1 -i -Xfreethreading_compatible=True %~dp0test_*.pyx endlocal diff --git a/cuda_bindings/tests/cython/build_tests.sh b/cuda_bindings/tests/cython/build_tests.sh index 1e08f35955..c2ddc9ea79 100755 --- a/cuda_bindings/tests/cython/build_tests.sh +++ b/cuda_bindings/tests/cython/build_tests.sh @@ -14,4 +14,5 @@ else exit 1 fi -cythonize -3 -i -Xfreethreading_compatible=True ${SCRIPTPATH}/test_*.pyx +# Use -j 1 to side-step any process-pool issues and ensure deterministic single-threaded builds +cythonize -3 -j 1 -i -Xfreethreading_compatible=True ${SCRIPTPATH}/test_*.pyx diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 1f4735ba38..e5a40694eb 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -15,6 +15,8 @@ import cuda.bindings.driver as cuda import pytest +cufile = pytest.importorskip("cuda.bindings.cufile") + # Configure logging to show INFO level and above 
logging.basicConfig( level=logging.INFO, @@ -22,12 +24,13 @@ force=True, # Override any existing logging configuration ) -try: - from cuda.bindings import cufile -except ImportError: - cufile = cuFileError = None -else: - from cuda.bindings.cufile import cuFileError + +def platform_is_tegra_linux(): + return pathlib.Path("/etc/nv_tegra_release").exists() + + +if platform_is_tegra_linux(): + pytest.skip("skipping cuFile tests on Tegra Linux", allow_module_level=True) def platform_is_wsl(): @@ -35,13 +38,13 @@ def platform_is_wsl(): return platform.system() == "Linux" and "microsoft" in pathlib.Path("/proc/version").read_text().lower() -if cufile is None: - pytest.skip("skipping tests on Windows", allow_module_level=True) - if platform_is_wsl(): pytest.skip("skipping cuFile tests on WSL", allow_module_level=True) +from cuda.bindings.cufile import cuFileError + + @pytest.fixture def cufile_env_json(): """Set CUFILE_ENV_PATH_JSON environment variable for async tests.""" @@ -1445,9 +1448,14 @@ def test_param(param, val): assert retrieved_val is val cufile.set_parameter_bool(param, orig_val) - # Test setting and getting various boolean parameters - for param, val in param_val_pairs: - test_param(param, val) + try: + # Test setting and getting various boolean parameters + for param, val in param_val_pairs: + test_param(param, val) + except cufile.cuFileError: + if cufile.get_version() < 1160: + raise + assert param is cufile.BoolConfigParameter.PROFILE_NVTX # Deprecated in CTK 13.1.0 @pytest.mark.skipif( diff --git a/cuda_bindings/tests/test_utils.py b/cuda_bindings/tests/test_utils.py index 4c5751b266..3097c93974 100644 --- a/cuda_bindings/tests/test_utils.py +++ b/cuda_bindings/tests/test_utils.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -import platform +import importlib.util import random import subprocess import sys @@ -11,6 +11,8 @@ from cuda.bindings import driver, runtime from cuda.bindings.utils import get_cuda_native_handle, get_minimal_required_cuda_ver_from_ptx_ver, get_ptx_ver +have_cufile = importlib.util.find_spec("cuda.bindings.cufile") is not None + ptx_88_kernel = r""" .version 8.8 .target sm_75 @@ -101,7 +103,7 @@ def test_get_handle_error(target): "nvrtc", "nvvm", "runtime", - *(["cufile"] if platform.system() != "Windows" else []), + *(["cufile"] if have_cufile else []), ], ) def test_cyclical_imports(module): diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py index c961e82ac5..a025fc044c 100644 --- a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py @@ -7,7 +7,7 @@ # Also update the CUDA Toolkit version number below. # ruff: noqa: E501 -# CUDA Toolkit v13.0.0 +# CUDA Toolkit v13.1.0 DRIVER_CU_RESULT_EXPLANATIONS = { 0: ( "The API call returned with no errors. In the case of query calls, this" @@ -347,5 +347,10 @@ 914: ("This error indicates one or more resources passed in are not valid resource types for the operation."), 915: ("This error indicates one or more resources are insufficient or non-applicable for the operation."), 916: ("This error indicates that an error happened during the key rotation sequence."), + 917: ( + "This error indicates that the requested operation is not permitted because the" + " stream is in a detached state. This can occur if the green context associated" + " with the stream has been destroyed, limiting the stream's operational capabilities." 
+ ), 999: "This indicates that an unknown internal error has occurred.", } diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py index 126897f2b5..dd73e36cc8 100644 --- a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py @@ -7,7 +7,7 @@ # Also update the CUDA Toolkit version number below. # ruff: noqa: E501 -# CUDA Toolkit v13.0.0 +# CUDA Toolkit v13.1.0 RUNTIME_CUDA_ERROR_EXPLANATIONS = { 0: ( "The API call returned with no errors. In the case of query calls, this" @@ -530,6 +530,11 @@ 913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."), 914: ("This error indicates one or more resources passed in are not valid resource types for the operation."), 915: ("This error indicates one or more resources are insufficient or non-applicable for the operation."), + 917: ( + "This error indicates that the requested operation is not permitted because the" + " stream is in a detached state. This can occur if the green context associated" + " with the stream has been destroyed, limiting the stream's operational capabilities." 
+ ), 999: "This indicates that an unknown internal error has occurred.", 10000: ( "Any unhandled CUDA driver error is added to this value and returned via" diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py index 0997cff4cf..20260a5266 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py @@ -89,51 +89,38 @@ "cutensorMg": ("cutensor", "cublasLt"), } -# Based on these released files: -# cuda_11.0.3_450.51.06_linux.run -# cuda_11.1.1_455.32.00_linux.run -# cuda_11.2.2_460.32.03_linux.run -# cuda_11.3.1_465.19.01_linux.run -# cuda_11.4.4_470.82.01_linux.run -# cuda_11.5.1_495.29.05_linux.run -# cuda_11.6.2_510.47.03_linux.run -# cuda_11.7.1_515.65.01_linux.run -# cuda_11.8.0_520.61.05_linux.run +# Based on these files: # cuda_12.0.1_525.85.12_linux.run # cuda_12.1.1_530.30.02_linux.run # cuda_12.2.2_535.104.05_linux.run # cuda_12.3.2_545.23.08_linux.run # cuda_12.4.1_550.54.15_linux.run # cuda_12.5.1_555.42.06_linux.run -# cuda_12.6.2_560.35.03_linux.run +# cuda_12.6.3_560.35.05_linux.run # cuda_12.8.1_570.124.06_linux.run # cuda_12.9.1_575.57.08_linux.run -# cuda_13.0.0_580.65.06_linux.run +# cuda_13.0.2_580.95.05_linux.run +# cuda_13.1.0_590.44.01_linux.run # Generated with toolshed/build_pathfinder_sonames.py # Please keep in old → new sort order. 
SUPPORTED_LINUX_SONAMES_CTK = { "cublas": ( - "libcublas.so.11", "libcublas.so.12", "libcublas.so.13", ), "cublasLt": ( - "libcublasLt.so.11", "libcublasLt.so.12", "libcublasLt.so.13", ), "cudart": ( - "libcudart.so.11.0", "libcudart.so.12", "libcudart.so.13", ), "cufft": ( - "libcufft.so.10", "libcufft.so.11", "libcufft.so.12", ), "cufftw": ( - "libcufftw.so.10", "libcufftw.so.11", "libcufftw.so.12", ), @@ -141,71 +128,55 @@ # "cufile_rdma": ("libcufile_rdma.so.1",), "curand": ("libcurand.so.10",), "cusolver": ( - "libcusolver.so.10", "libcusolver.so.11", "libcusolver.so.12", ), "cusolverMg": ( - "libcusolverMg.so.10", "libcusolverMg.so.11", "libcusolverMg.so.12", ), - "cusparse": ( - "libcusparse.so.11", - "libcusparse.so.12", - ), + "cusparse": ("libcusparse.so.12",), "nppc": ( - "libnppc.so.11", "libnppc.so.12", "libnppc.so.13", ), "nppial": ( - "libnppial.so.11", "libnppial.so.12", "libnppial.so.13", ), "nppicc": ( - "libnppicc.so.11", "libnppicc.so.12", "libnppicc.so.13", ), "nppidei": ( - "libnppidei.so.11", "libnppidei.so.12", "libnppidei.so.13", ), "nppif": ( - "libnppif.so.11", "libnppif.so.12", "libnppif.so.13", ), "nppig": ( - "libnppig.so.11", "libnppig.so.12", "libnppig.so.13", ), "nppim": ( - "libnppim.so.11", "libnppim.so.12", "libnppim.so.13", ), "nppist": ( - "libnppist.so.11", "libnppist.so.12", "libnppist.so.13", ), "nppisu": ( - "libnppisu.so.11", "libnppisu.so.12", "libnppisu.so.13", ), "nppitc": ( - "libnppitc.so.11", "libnppitc.so.12", "libnppitc.so.13", ), "npps": ( - "libnpps.so.11", "libnpps.so.12", "libnpps.so.13", ), @@ -214,7 +185,6 @@ "libnvJitLink.so.13", ), "nvblas": ( - "libnvblas.so.11", "libnvblas.so.12", "libnvblas.so.13", ), @@ -223,21 +193,14 @@ "libnvfatbin.so.13", ), "nvjpeg": ( - "libnvjpeg.so.11", "libnvjpeg.so.12", "libnvjpeg.so.13", ), "nvrtc": ( - "libnvrtc.so.11.0", - "libnvrtc.so.11.1", - "libnvrtc.so.11.2", "libnvrtc.so.12", "libnvrtc.so.13", ), - "nvvm": ( - "libnvvm.so.3", - "libnvvm.so.4", - ), + "nvvm": 
("libnvvm.so.4",), } SUPPORTED_LINUX_SONAMES_OTHER = { "cublasmp": ("libcublasmp.so.0",), @@ -253,123 +216,92 @@ } SUPPORTED_LINUX_SONAMES = SUPPORTED_LINUX_SONAMES_CTK | SUPPORTED_LINUX_SONAMES_OTHER -# Based on these released files: -# cuda_11.0.3_451.82_win10.exe -# cuda_11.1.1_456.81_win10.exe -# cuda_11.2.2_461.33_win10.exe -# cuda_11.3.1_465.89_win10.exe -# cuda_11.4.4_472.50_windows.exe -# cuda_11.5.1_496.13_windows.exe -# cuda_11.6.2_511.65_windows.exe -# cuda_11.7.1_516.94_windows.exe -# cuda_11.8.0_522.06_windows.exe +# Based on these files: # cuda_12.0.1_528.33_windows.exe # cuda_12.1.1_531.14_windows.exe # cuda_12.2.2_537.13_windows.exe # cuda_12.3.2_546.12_windows.exe # cuda_12.4.1_551.78_windows.exe # cuda_12.5.1_555.85_windows.exe -# cuda_12.6.2_560.94_windows.exe +# cuda_12.6.3_561.17_windows.exe # cuda_12.8.1_572.61_windows.exe # cuda_12.9.1_576.57_windows.exe -# cuda_13.0.0_windows.exe +# cuda_13.0.2_windows.exe +# cuda_13.1.0_windows.exe # Generated with toolshed/build_pathfinder_dlls.py # Please keep in old → new sort order. 
SUPPORTED_WINDOWS_DLLS_CTK = { "cublas": ( - "cublas64_11.dll", "cublas64_12.dll", "cublas64_13.dll", ), "cublasLt": ( - "cublasLt64_11.dll", "cublasLt64_12.dll", "cublasLt64_13.dll", ), "cudart": ( - "cudart64_101.dll", - "cudart64_110.dll", "cudart64_12.dll", "cudart64_13.dll", - "cudart64_65.dll", ), "cufft": ( - "cufft64_10.dll", "cufft64_11.dll", "cufft64_12.dll", ), "cufftw": ( - "cufftw64_10.dll", "cufftw64_11.dll", "cufftw64_12.dll", ), "curand": ("curand64_10.dll",), "cusolver": ( - "cusolver64_10.dll", "cusolver64_11.dll", "cusolver64_12.dll", ), "cusolverMg": ( - "cusolverMg64_10.dll", "cusolverMg64_11.dll", "cusolverMg64_12.dll", ), - "cusparse": ( - "cusparse64_11.dll", - "cusparse64_12.dll", - ), + "cusparse": ("cusparse64_12.dll",), "nppc": ( - "nppc64_11.dll", "nppc64_12.dll", "nppc64_13.dll", ), "nppial": ( - "nppial64_11.dll", "nppial64_12.dll", "nppial64_13.dll", ), "nppicc": ( - "nppicc64_11.dll", "nppicc64_12.dll", "nppicc64_13.dll", ), "nppidei": ( - "nppidei64_11.dll", "nppidei64_12.dll", "nppidei64_13.dll", ), "nppif": ( - "nppif64_11.dll", "nppif64_12.dll", "nppif64_13.dll", ), "nppig": ( - "nppig64_11.dll", "nppig64_12.dll", "nppig64_13.dll", ), "nppim": ( - "nppim64_11.dll", "nppim64_12.dll", "nppim64_13.dll", ), "nppist": ( - "nppist64_11.dll", "nppist64_12.dll", "nppist64_13.dll", ), "nppisu": ( - "nppisu64_11.dll", "nppisu64_12.dll", "nppisu64_13.dll", ), "nppitc": ( - "nppitc64_11.dll", "nppitc64_12.dll", "nppitc64_13.dll", ), "npps": ( - "npps64_11.dll", "npps64_12.dll", "npps64_13.dll", ), @@ -378,7 +310,6 @@ "nvJitLink_130_0.dll", ), "nvblas": ( - "nvblas64_11.dll", "nvblas64_12.dll", "nvblas64_13.dll", ), @@ -387,20 +318,15 @@ "nvfatbin_130_0.dll", ), "nvjpeg": ( - "nvjpeg64_11.dll", "nvjpeg64_12.dll", "nvjpeg64_13.dll", ), "nvrtc": ( - "nvrtc64_110_0.dll", - "nvrtc64_111_0.dll", - "nvrtc64_112_0.dll", "nvrtc64_120_0.dll", "nvrtc64_130_0.dll", ), "nvvm": ( "nvvm64.dll", - "nvvm64_33_0.dll", "nvvm64_40_0.dll", "nvvm70.dll", ), diff 
--git a/cuda_pathfinder/cuda/pathfinder/_version.py b/cuda_pathfinder/cuda/pathfinder/_version.py index 7c72217355..e7234fe02a 100644 --- a/cuda_pathfinder/cuda/pathfinder/_version.py +++ b/cuda_pathfinder/cuda/pathfinder/_version.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -__version__ = "1.3.3a0" +__version__ = "1.3.3" diff --git a/cuda_pathfinder/docs/nv-versions.json b/cuda_pathfinder/docs/nv-versions.json index 2bec723d4b..77167c9149 100644 --- a/cuda_pathfinder/docs/nv-versions.json +++ b/cuda_pathfinder/docs/nv-versions.json @@ -3,6 +3,10 @@ "version": "latest", "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/latest/" }, + { + "version": "1.3.3", + "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.3.3/" + }, { "version": "1.3.2", "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.3.2/" diff --git a/cuda_pathfinder/docs/source/release/1.3.3-notes.rst b/cuda_pathfinder/docs/source/release/1.3.3-notes.rst new file mode 100644 index 0000000000..c0bdbf901f --- /dev/null +++ b/cuda_pathfinder/docs/source/release/1.3.3-notes.rst @@ -0,0 +1,21 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +.. py:currentmodule:: cuda.pathfinder + +``cuda-pathfinder`` 1.3.3 Release notes +======================================= + +Released on Dec 4, 2025 + +Highlights +---------- + +* Add cuSPARSELt support (`PR #1200 `_) + +* Purge support for CTK 11 (those became unsupported with the CTK 13.0 release in August 2025). + (`PR #1315 `_) + +* Remove ``cudart64_65.dll`` and ``cudart64_101.dll`` from supported_nvidia_libs.py; + these outdated DLLs were included accidentally. 
+ (`PR #1315 `_) diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index b3b9ed2d0c..d446c1a60d 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -33,6 +33,7 @@ cu13 = [ "cutensor-cu13", "nvidia-cublasmp-cu13; sys_platform != 'win32'", "nvidia-cudss-cu13", + "nvidia-cufftmp-cu13; sys_platform != 'win32'", "nvidia-cusparselt-cu13", "nvidia-nccl-cu13; sys_platform != 'win32'", "nvidia-nvshmem-cu13; sys_platform != 'win32'", diff --git a/toolshed/build_pathfinder_dlls.py b/toolshed/build_pathfinder_dlls.py index e44e29dc15..d9d6f1082a 100755 --- a/toolshed/build_pathfinder_dlls.py +++ b/toolshed/build_pathfinder_dlls.py @@ -49,6 +49,12 @@ def is_suppressed_dll(libname, dll): if libname == "cudart": if dll.startswith("cudart32_"): return True + if dll == "cudart64_65.dll": + # PhysX/files/Common/cudart64_65.dll from CTK 6.5, but shipped with CTK 12.0-12.9 + return True + if dll == "cudart64_101.dll": + # GFExperience.NvStreamSrv/amd64/server/cudart64_101.dll from CTK 10.1, but shipped with CTK 12.0-12.6 + return True elif libname == "nvrtc": if dll.endswith(".alt.dll"): return True