Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 0 additions & 63 deletions .github/workflows/config/validation_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,41 +75,6 @@
"cuda_distribution": "fedora42"
}
},
{
"platform": "linux/amd64",
"cuda_major": 11,
"operating_systems":
[
"ubuntu:22.04",
"redhat/ubi8:8.10",
"opensuse/leap:15.5",
"fedora:42"
],
"ubuntu:22.04":
{
"libcdev_package": "libc6-dev",
"cudart_version": "11.8",
"cuda_distribution": "ubuntu2204"
},
"redhat/ubi8:8.10":
{
"libcdev_package": "glibc-devel",
"cudart_version": "11.8",
"cuda_distribution": "rhel9"
},
"opensuse/leap:15.5":
{
"libcdev_package": "glibc-devel",
"cudart_version": "11.8",
"cuda_distribution": "opensuse15"
},
"fedora:42":
{
"libcdev_package": "glibc-devel",
"cudart_version": "11.8",
"cuda_distribution": "rhel9"
}
},
{
"platform": "linux/arm64",
"cuda_major": 12,
Expand All @@ -130,34 +95,6 @@
"cudart_version": "12.6",
"cuda_distribution": "rhel9"
}
},
{
"platform": "linux/arm64",
"cuda_major": 11,
"operating_systems":
[
"ubuntu:22.04",
"redhat/ubi8:8.10",
"fedora:42"
],
"ubuntu:22.04":
{
"libcdev_package": "libc6-dev",
"cudart_version": "11.8",
"cuda_distribution": "ubuntu2204"
},
"redhat/ubi8:8.10":
{
"libcdev_package": "glibc-devel",
"cudart_version": "11.8",
"cuda_distribution": "rhel9"
},
"fedora:42":
{
"libcdev_package": "glibc-devel",
"cudart_version": "11.8",
"cuda_distribution": "rhel9"
}
}
]
}
4 changes: 2 additions & 2 deletions docker/build/devdeps.ext.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ ENV UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy,sm

# Install CUDA

ARG cuda_packages="cuda-cudart cuda-nvrtc cuda-compiler libcublas libcublas-dev libcurand-dev libcusolver libcusparse-dev libnvjitlink"
ARG cuda_packages="cuda-cudart cuda-nvrtc cuda-compiler libcublas libcublas-dev libcurand-dev libcusolver libcusparse-dev libnvjitlink cuda-nvml-dev"
RUN if [ -n "$cuda_packages" ]; then \
# Filter out libnvjitlink if CUDA version is less than 12
if [ $(echo $CUDA_VERSION | cut -d "." -f1) -lt 12 ]; then \
Expand Down Expand Up @@ -175,7 +175,7 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip && \
apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==13.4.1 cuquantum-cu$(echo $CUDA_VERSION | cut -d . -f1)==25.06 && \
python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==13.4.1 cuquantum-cu$(echo $CUDA_VERSION | cut -d . -f1)==25.09 && \
if [ "$(python3 --version | grep -o [0-9\.]* | cut -d . -f -2)" != "3.12" ]; then \
echo "expecting Python version 3.12"; \
fi
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ requires-python = ">=3.10"
license = { file="LICENSE" }
dependencies = [
'astpretty ~= 3.0',
'cuquantum-cu12 == 25.06',
'cuquantum-cu12 == 25.09',
'numpy >= 1.24',
'scipy >= 1.10.1',
'requests >= 2.31',
Expand Down
8 changes: 5 additions & 3 deletions runtime/nvqir/cutensornet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@ set(CUTENSORNET_PATCH ${CMAKE_MATCH_1})

set(CUTENSORNET_VERSION ${CUTENSORNET_MAJOR}.${CUTENSORNET_MINOR}.${CUTENSORNET_PATCH})
message(STATUS "Found cutensornet version: ${CUTENSORNET_VERSION}")
# We need cutensornet v2.7.0+ (cutensornetStateApplyGeneralChannel)
if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.7")
# We need cutensornet v2.9.0+
# Using the new flow: define a network with cutensornetCreateNetwork, append inputs via cutensornetNetworkAppendTensor, set output with cutensornetNetworkSetOutputTensor,
# then prepare and run using cutensornetNetworkPrepareContraction and cutensornetNetworkContract.
if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.9")
set (BASE_TENSOR_BACKEND_SRS tensornet_utils.cpp)
get_filename_component(CUTENSORNET_INCLUDE_DIR ${CUTENSORNET_INC} DIRECTORY)
get_filename_component(CUTENSORNET_LIB_DIR ${CUTENSORNET_LIB} DIRECTORY)
Expand Down Expand Up @@ -93,5 +95,5 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.7")
target_link_libraries(nvqir-tensornet PRIVATE tensornet-mpi-util)
target_link_libraries(nvqir-tensornet-fp32 PRIVATE tensornet-mpi-util)
else()
message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.3.0+.")
message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.9.0+.")
endif()
50 changes: 36 additions & 14 deletions runtime/nvqir/cutensornet/mps_simulation_state.inc
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,26 @@ std::complex<double> MPSSimulationState<ScalarType>::computeOverlap(
}

cutensornetNetworkDescriptor_t m_tnDescr;
HANDLE_CUTN_ERROR(cutensornetCreateNetworkDescriptor(
cutnHandle, numTensors, numModes.data(), extentsIn.data(), NULL,
modesIn.data(), tensAttr.data(), 0, NULL, NULL, NULL, dataType,
computeType, &m_tnDescr));
// Set up tensor network
HANDLE_CUTN_ERROR(cutensornetCreateNetwork(cutnHandle, &m_tnDescr));

int64_t tensorIDs[numTensors]; // for input tensors

// attach the input tensors to the network
for (int32_t t = 0; t < numTensors; ++t) {
HANDLE_CUTN_ERROR(cutensornetNetworkAppendTensor(
cutnHandle, m_tnDescr, tensModes[t].size(), tensExtents[t].data(),
tensModes[t].data(), &tensAttr[t], dataType, &tensorIDs[t]));
}

// set the output tensor
HANDLE_CUTN_ERROR(cutensornetNetworkSetOutputTensor(cutnHandle, m_tnDescr, 0,
NULL, dataType));

// set the network compute type
HANDLE_CUTN_ERROR(cutensornetNetworkSetAttribute(
cutnHandle, m_tnDescr, CUTENSORNET_NETWORK_COMPUTE_TYPE, &computeType,
sizeof(computeType)));

cutensornetContractionOptimizerConfig_t m_tnConfig;

Expand Down Expand Up @@ -188,11 +204,10 @@ std::complex<double> MPSSimulationState<ScalarType>::computeOverlap(
cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
CUTENSORNET_WORKSPACE_SCRATCH, scratchPad.d_scratch,
requiredWorkspaceSize));
cutensornetContractionPlan_t m_tnPlan;
{
ScopedTraceWithContext("cutensornetCreateContractionPlan");
HANDLE_CUTN_ERROR(cutensornetCreateContractionPlan(
cutnHandle, m_tnDescr, m_tnPath, workDesc, &m_tnPlan));
ScopedTraceWithContext("cutensornetNetworkPrepareContraction");
HANDLE_CUTN_ERROR(
cutensornetNetworkPrepareContraction(cutnHandle, m_tnDescr, workDesc));
}
// Compute the unnormalized overlap
std::vector<const void *> rawDataIn(numTensors);
Expand All @@ -202,11 +217,19 @@ std::complex<double> MPSSimulationState<ScalarType>::computeOverlap(
}
void *m_dOverlap{nullptr};
HANDLE_CUDA_ERROR(cudaMalloc(&m_dOverlap, overlapSize));

// Set tensor's data buffers and strides
for (int32_t t = 0; t < numTensors; ++t) {
HANDLE_CUTN_ERROR(cutensornetNetworkSetInputTensorMemory(
cutnHandle, m_tnDescr, tensorIDs[t], rawDataIn[t], NULL));
}
HANDLE_CUTN_ERROR(cutensornetNetworkSetOutputTensorMemory(
cutnHandle, m_tnDescr, m_dOverlap, NULL));

{
ScopedTraceWithContext("cutensornetContractSlices");
HANDLE_CUTN_ERROR(cutensornetContractSlices(cutnHandle, m_tnPlan,
rawDataIn.data(), m_dOverlap, 0,
workDesc, NULL, 0x0));
ScopedTraceWithContext("cutensornetNetworkContract");
HANDLE_CUTN_ERROR(cutensornetNetworkContract(cutnHandle, m_tnDescr, 0,
workDesc, NULL, 0x0));
}
// Get the overlap value back to Host
std::complex<double> overlap = 0.0;
Expand All @@ -224,10 +247,9 @@ std::complex<double> MPSSimulationState<ScalarType>::computeOverlap(

// Clean up
HANDLE_CUDA_ERROR(cudaFree(m_dOverlap));
HANDLE_CUTN_ERROR(cutensornetDestroyContractionPlan(m_tnPlan));
HANDLE_CUTN_ERROR(cutensornetDestroyContractionOptimizerInfo(m_tnPath));
HANDLE_CUTN_ERROR(cutensornetDestroyContractionOptimizerConfig(m_tnConfig));
HANDLE_CUTN_ERROR(cutensornetDestroyNetworkDescriptor(m_tnDescr));
HANDLE_CUTN_ERROR(cutensornetDestroyNetwork(m_tnDescr));

return std::abs(overlap);
}
Expand Down
12 changes: 7 additions & 5 deletions scripts/configure_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ if [ "$1" == "install-cuda" ]; then
dnf config-manager --add-repo "${CUDA_DOWNLOAD_URL}/${DISTRIBUTION}/${CUDA_ARCH_FOLDER}/cuda-${DISTRIBUTION}.repo"
dnf install -y --nobest --setopt=install_weak_deps=False \
cuda-toolkit-$(echo ${CUDA_VERSION} | tr . -)
    # custatevec is now linked against `libnvidia-ml.so.1`, which is provided by the NVIDIA driver.
    # For builds on non-GPU systems, we therefore also need to install the driver libraries.
dnf install -y --nobest --setopt=install_weak_deps=False nvidia-driver-libs
# [<CUDAInstall]
fi

Expand Down Expand Up @@ -72,7 +75,7 @@ if [ "$1" == "install-cuquantum" ]; then
CUDA_ARCH_FOLDER=$([ "$(uname -m)" == "aarch64" ] && echo sbsa || echo x86_64)

# [>cuQuantumInstall]
CUQUANTUM_VERSION=25.06.0.10
CUQUANTUM_VERSION=25.09.0.7
CUQUANTUM_DOWNLOAD_URL=https://developer.download.nvidia.com/compute/cuquantum/redist/cuquantum

cuquantum_archive=cuquantum-linux-${CUDA_ARCH_FOLDER}-${CUQUANTUM_VERSION}_cuda$(echo ${CUDA_VERSION} | cut -d . -f1)-archive.tar.xz
Expand All @@ -88,14 +91,13 @@ if [ "$1" == "install-cutensor" ]; then
CUDA_ARCH_FOLDER=$([ "$(uname -m)" == "aarch64" ] && echo sbsa || echo x86_64)

# [>cuTensorInstall]
CUTENSOR_VERSION=2.2.0.0
CUTENSOR_VERSION=2.3.1.0
CUTENSOR_DOWNLOAD_URL=https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor

cutensor_archive=libcutensor-linux-${CUDA_ARCH_FOLDER}-${CUTENSOR_VERSION}-archive.tar.xz
cutensor_archive=libcutensor-linux-${CUDA_ARCH_FOLDER}-${CUTENSOR_VERSION}_cuda$(echo ${CUDA_VERSION} | cut -d . -f1)-archive.tar.xz
wget "${CUTENSOR_DOWNLOAD_URL}/linux-${CUDA_ARCH_FOLDER}/${cutensor_archive}"
mkdir -p "${CUTENSOR_INSTALL_PREFIX}" && tar xf "${cutensor_archive}" --strip-components 1 -C "${CUTENSOR_INSTALL_PREFIX}"
mv "${CUTENSOR_INSTALL_PREFIX}"/lib/$(echo ${CUDA_VERSION} | cut -d . -f1)/* ${CUTENSOR_INSTALL_PREFIX}/lib/
ls -d ${CUTENSOR_INSTALL_PREFIX}/lib/*/ | xargs rm -rf && rm -rf "${cutensor_archive}"
rm -rf "${cutensor_archive}"
# [<cuTensorInstall]
fi

Expand Down
30 changes: 30 additions & 0 deletions unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,36 @@ create_tests_with_backend(dm backends/QPPDMTester.cpp)
create_tests_with_backend(stim "")

if (CUSTATEVEC_ROOT AND CUDA_FOUND)
find_program(NVIDIA_SMI "nvidia-smi")
if(NOT NVIDIA_SMI)
  # libcustatevec.so has linkage to libnvidia-ml.so.1, which is part of the NVIDIA driver.
  # On a build system without NVIDIA GPUs, this lib cannot be resolved;
  # hence, linking these test executables will fail.
  # On these CPU-only build systems, we directly link the nvidia-ml-dev library, which provides
  # stub symbols for the actual driver library, to these test executables.
  # Running these tests will ultimately require a system with GPUs, i.e., the proper nvidia-ml lib
  # will be loaded by the runtime linker/loader.
  find_package(CUDAToolkit REQUIRED)
  # CUDAToolkit does not always define CUDA_TOOLKIT_ROOT_DIR; derive it from the bin dir.
  if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR)
    set(CUDA_TOOLKIT_ROOT_DIR ${CUDAToolkit_BIN_DIR}/..)
  endif()
  # The stub library lives in an architecture-specific target folder of the toolkit.
  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
    set(NVIDIA_ML_PATH ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs)
  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
    set(NVIDIA_ML_PATH ${CUDA_TOOLKIT_ROOT_DIR}/targets/sbsa-linux/lib/stubs)
  else()
    message(FATAL_ERROR "Neither x86_64 nor aarch64 was detected.")
  endif()
  # Locate the nvidia-ml stub manually (the stub dir is not on the default search path).
  find_library(NVIDIA_ML NAMES nvidia-ml PATHS ${NVIDIA_ML_PATH})
  message(STATUS "NVIDIA ML lib: ${NVIDIA_ML}")
  # Expose the stub under the soname the linker resolves (libnvidia-ml.so.1) during build,
  # and remove the injected link again at install time.
  file(CREATE_LINK ${NVIDIA_ML} ${CMAKE_CURRENT_BINARY_DIR}/libnvidia-ml.so.1)
  link_directories(${CMAKE_CURRENT_BINARY_DIR})
  install(CODE "file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/libnvidia-ml.so.1)")
endif()

create_tests_with_backend(custatevec-fp32 "")
# Given that the fp32 and fp64 difference is largely inherited
# from a dependency, we omit fp64 tests here and rely on the
Expand Down
Loading