NVIDIA · 1tnguyen · Apr 2, 2024 · Mar 14, 2024 · Mar 15, 2024 · Mar 15, 2024
diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -18,6 +18,8 @@ class SimulatorMPS : public SimulatorTensorNetBase {
   // Default relative cutoff
   double m_relCutoff = 1e-5;
   std::vector<void *> m_mpsTensors_d;
+  // List of auxiliary qubits that were used for controlled-gate decomposition.
+  std::vector<std::size_t> m_auxQubitsForGateDecomp;
 
 public:
   SimulatorMPS() : SimulatorTensorNetBase() {
@@ -108,6 +110,245 @@ class SimulatorMPS : public SimulatorTensorNetBase {
     }
     m_mpsTensors_d.clear();
   }
+
+  void deallocateStateImpl() override {
+    m_auxQubitsForGateDecomp.clear();
+    SimulatorTensorNetBase::deallocateStateImpl();
+  }
+
+  /// @brief Return the state vector data
+  cudaq::State getStateData() override {
+    LOG_API_TIME();
+    if (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size() > 64)
+      throw std::runtime_error("State vector data is too large.");
+    // Handle empty state (e.g., no qubit allocation)
+    if (!m_state)
+      return cudaq::State{{0}, {}};
+    const uint64_t svDim =
+        (1ull << (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size()));
+    const std::vector<int32_t> projectedModes(m_auxQubitsForGateDecomp.begin(),
+                                              m_auxQubitsForGateDecomp.end());
+    // Returns the main qubit register state (auxiliary qubits are projected to
+    // zero state)
+    return cudaq::State{{svDim}, m_state->getStateVector(projectedModes)};
+  }
+
+  std::vector<size_t> addAuxQubits(std::size_t n) {
+    if (m_state->isDirty())
+      throw std::runtime_error(
+          "[MPS Simulator] Unable to perform multi-control gate decomposition "
+          "due to dynamical circuits.");
+    std::vector<size_t> aux(n);
+    std::iota(aux.begin(), aux.end(), m_state->getNumQubits());
+    m_state = std::make_unique<TensorNetState>(m_state->getNumQubits() + n,
+                                               m_cutnHandle);
+    return aux;
+  }
+
+  template <typename QuantumOperation>
+  void
+  decomposeMultiControlledInstruction(const std::vector<double> &params,
+                                      const std::vector<std::size_t> &controls,
+                                      const std::vector<std::size_t> &targets) {
+    if (controls.size() <= 1) {
+      enqueueQuantumOperation<QuantumOperation>(params, controls, targets);
+      return;
+    }
+
+    // CCNOT decomposition
+    const auto ccnot = [&](std::size_t a, std::size_t b, std::size_t c) {
+      enqueueQuantumOperation<nvqir::h<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {b}, {c});
+      enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {c});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {b}, {c});
+      enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {c});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {b});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::h<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {b});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {a});
+      enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {b});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {b});
+    };
+
+    // Collects the given list of control qubits into the given auxiliary
+    // qubits, using all but the last qubits in the auxiliary list as scratch
+    // qubits.
+    //
+    // For example, if the controls list is 6 qubits, the auxiliary list must be
+    // 5 qubits, and the state from the 6 control qubits will be collected into
+    // the last qubit of the auxiliary array.
+    const auto collectControls = [&](const std::vector<std::size_t> &ctls,
+                                     const std::vector<std::size_t> &aux,
+                                     bool reverse = false) {
+      std::vector<std::tuple<std::size_t, std::size_t, std::size_t>> ccnotList;
+      for (int i = 0; i < static_cast<int>(ctls.size()) - 1; i += 2)
+        ccnotList.emplace_back(
+            std::make_tuple(ctls[i], ctls[i + 1], aux[i / 2]));
+
+      for (int i = 0; i < static_cast<int>(ctls.size()) / 2 - 1; ++i)
+        ccnotList.emplace_back(std::make_tuple(aux[i * 2], aux[(i * 2) + 1],
+                                               aux[i + ctls.size() / 2]));
+
+      if (ctls.size() % 2 != 0)
+        ccnotList.emplace_back(std::make_tuple(
+            ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]));
+
+      if (reverse)
+        std::reverse(ccnotList.begin(), ccnotList.end());
+
+      for (const auto &[a, b, c] : ccnotList)
+        ccnot(a, b, c);
+    };
+
+    if (m_auxQubitsForGateDecomp.size() < controls.size() - 1) {
+      const auto aux =
+          addAuxQubits(controls.size() - 1 - m_auxQubitsForGateDecomp.size());
+      m_auxQubitsForGateDecomp.insert(m_auxQubitsForGateDecomp.end(),
+                                      aux.begin(), aux.end());
+    }
+
+    collectControls(controls, m_auxQubitsForGateDecomp);
+
+    // Add to the singly-controlled instruction queue
+    enqueueQuantumOperation<QuantumOperation>(
+        params, {m_auxQubitsForGateDecomp[controls.size() - 2]}, targets);
+
+    collectControls(controls, m_auxQubitsForGateDecomp, true);
+  };
+
+// Gate implementations:
+// Here, we forward all the call to the multi-control decomposition helper.
+// Decomposed gates are added to the queue.
+#define CIRCUIT_SIMULATOR_ONE_QUBIT(NAME)                                      \
+  using CircuitSimulator::NAME;                                                \
+  void NAME(const std::vector<std::size_t> &controls,                          \
+            const std::size_t qubitIdx) override {                             \
+    decomposeMultiControlledInstruction<nvqir::NAME<double>>(                  \
+        {}, controls, std::vector<std::size_t>{qubitIdx});                     \
+  }
+
+#define CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(NAME)                            \
+  using CircuitSimulator::NAME;                                                \
+  void NAME(const double angle, const std::vector<std::size_t> &controls,      \
+            const std::size_t qubitIdx) override {                             \
+    decomposeMultiControlledInstruction<nvqir::NAME<double>>(                  \
+        {angle}, controls, std::vector<std::size_t>{qubitIdx});                \
+  }
+
+  /// @brief The X gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(x)
+  /// @brief The Y gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(y)
+  /// @brief The Z gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(z)
+  /// @brief The H gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(h)
+  /// @brief The S gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(s)
+  /// @brief The T gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(t)
+  /// @brief The Sdg gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(sdg)
+  /// @brief The Tdg gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(tdg)
+  /// @brief The RX gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rx)
+  /// @brief The RY gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(ry)
+  /// @brief The RZ gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rz)
+  /// @brief The Phase gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(r1)
+// Undef those preprocessor defines.
+#undef CIRCUIT_SIMULATOR_ONE_QUBIT
+#undef CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM
+
+  // Swap gate implementation
+  using CircuitSimulator::swap;
+  void swap(const std::vector<std::size_t> &ctrlBits, const std::size_t srcIdx,
+            const std::size_t tgtIdx) override {
+    if (ctrlBits.empty())
+      return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx);
+    // Controlled swap gate: using cnot decomposition of swap gate to perform
+    // decomposition.
+    const auto size = ctrlBits.size();
+    std::vector<std::size_t> ctls(size + 1);
+    std::copy(ctrlBits.begin(), ctrlBits.end(), ctls.begin());
+    {
+      ctls[size] = tgtIdx;
+      decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
+    }
+    {
+      ctls[size] = srcIdx;
+      decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {tgtIdx});
+    }
+    {
+      ctls[size] = tgtIdx;
+      decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
+    }
+  }
+
+  // `exp-pauli` gate implementation: forward the middle-controlled Rz to the
+  // decomposition helper.
+  void applyExpPauli(double theta, const std::vector<std::size_t> &controls,
+                     const std::vector<std::size_t> &qubitIds,
+                     const cudaq::spin_op &op) override {
+    if (op.is_identity()) {
+      if (controls.empty()) {
+        // exp(i*theta*Id) is noop if this is not a controlled gate.
+        return;
+      } else {
+        // Throw an error if this exp_pauli(i*theta*Id) becomes a non-trivial
+        // gate due to control qubits.
+        // FIXME: revisit this once
+        // https://github.com/NVIDIA/cuda-quantum/issues/483 is implemented.
+        throw std::logic_error("Applying controlled global phase via exp_pauli "
+                               "of identity operator is not supported");
+      }
+    }
+    std::vector<std::size_t> qubitSupport;
+    std::vector<std::function<void(bool)>> basisChange;
+    op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) {
+      if (type != cudaq::pauli::I)
+        qubitSupport.push_back(qubitIds[qubitIdx]);
+
+      if (type == cudaq::pauli::Y)
+        basisChange.emplace_back([&, qubitIdx](bool reverse) {
+          rx(!reverse ? M_PI_2 : -M_PI_2, qubitIds[qubitIdx]);
+        });
+      else if (type == cudaq::pauli::X)
+        basisChange.emplace_back(
+            [&, qubitIdx](bool) { h(qubitIds[qubitIdx]); });
+    });
+
+    if (!basisChange.empty())
+      for (auto &basis : basisChange)
+        basis(false);
+
+    std::vector<std::pair<std::size_t, std::size_t>> toReverse;
+    for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) {
+      x({qubitSupport[i]}, qubitSupport[i + 1]);
+      toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]);
+    }
+
+    // Perform multi-control decomposition.
+    decomposeMultiControlledInstruction<nvqir::rz<double>>(
+        {-2.0 * theta}, controls, {qubitSupport.back()});
+
+    std::reverse(toReverse.begin(), toReverse.end());
+    for (auto &[i, j] : toReverse)
+      x({i}, j);
+
+    if (!basisChange.empty()) {
+      std::reverse(basisChange.begin(), basisChange.end());
+      for (auto &basis : basisChange)
+        basis(true);
+    }
+  }
 };
 } // end namespace nvqir
 

diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -23,19 +23,17 @@ TensorNetState::TensorNetState(std::size_t numQubits,
 void TensorNetState::applyGate(const std::vector<int32_t> &qubitIds,
                                void *gateDeviceMem, bool adjoint) {
 
-  int64_t id = 0;
   HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(
       m_cutnHandle, m_quantumState, qubitIds.size(), qubitIds.data(),
       gateDeviceMem, nullptr, /*immutable*/ 1,
-      /*adjoint*/ static_cast<int32_t>(adjoint), /*unitary*/ 1, &id));
+      /*adjoint*/ static_cast<int32_t>(adjoint), /*unitary*/ 1, &m_tensorId));
 }
 
 void TensorNetState::applyQubitProjector(void *proj_d, int32_t qubitIdx) {
-  int64_t id = 0;
   HANDLE_CUTN_ERROR(
       cutensornetStateApplyTensor(m_cutnHandle, m_quantumState, 1, &qubitIdx,
                                   proj_d, nullptr, /*immutable*/ 1,
-                                  /*adjoint*/ 0, /*unitary*/ 0, &id));
+                                  /*adjoint*/ 0, /*unitary*/ 0, &m_tensorId));
 }
 
 std::unordered_map<std::string, size_t>
@@ -120,24 +118,26 @@ TensorNetState::sample(const std::vector<int32_t> &measuredBitIds,
   return counts;
 }
 
-std::vector<std::complex<double>> TensorNetState::getStateVector() {
+std::vector<std::complex<double>>
+TensorNetState::getStateVector(const std::vector<int32_t> &projectedModes) {
   // Make sure that we don't overflow the memory size calculation.
   // Note: the actual limitation will depend on the system memory.
-  if (m_numQubits > 64 ||
-      (1ull << m_numQubits) >
+  if ((m_numQubits - projectedModes.size()) > 64 ||
+      (1ull << (m_numQubits - projectedModes.size())) >
           std::numeric_limits<uint64_t>::max() / sizeof(std::complex<double>))
     throw std::runtime_error(
         "Too many qubits are requested for full state vector contraction.");
   LOG_API_TIME();
   void *d_sv{nullptr};
-  const uint64_t svDim = 1ull << m_numQubits;
+  const uint64_t svDim = 1ull << (m_numQubits - projectedModes.size());
   HANDLE_CUDA_ERROR(cudaMalloc(&d_sv, svDim * sizeof(std::complex<double>)));
   ScratchDeviceMem scratchPad;
 
   // Create the quantum state amplitudes accessor
   cutensornetStateAccessor_t accessor;
-  HANDLE_CUTN_ERROR(cutensornetCreateAccessor(m_cutnHandle, m_quantumState, 0,
-                                              nullptr, nullptr, &accessor));
+  HANDLE_CUTN_ERROR(cutensornetCreateAccessor(
+      m_cutnHandle, m_quantumState, projectedModes.size(),
+      projectedModes.data(), nullptr, &accessor));
 
   const int32_t numHyperSamples =
       8; // desired number of hyper samples used in the tensor network
@@ -167,9 +167,11 @@ std::vector<std::complex<double>> TensorNetState::getStateVector() {
 
   // Compute the quantum state amplitudes
   std::complex<double> stateNorm{0.0, 0.0};
-  HANDLE_CUTN_ERROR(
-      cutensornetAccessorCompute(m_cutnHandle, accessor, nullptr, workDesc,
-                                 d_sv, static_cast<void *>(&stateNorm), 0));
+  // All projected modes are assumed to be projected to 0.
+  std::vector<int64_t> projectedModeValues(projectedModes.size(), 0);
+  HANDLE_CUTN_ERROR(cutensornetAccessorCompute(
+      m_cutnHandle, accessor, projectedModeValues.data(), workDesc, d_sv,
+      static_cast<void *>(&stateNorm), 0));
   std::vector<std::complex<double>> h_sv(svDim);
   HANDLE_CUDA_ERROR(cudaMemcpy(h_sv.data(), d_sv,
                                svDim * sizeof(std::complex<double>),

diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -13,12 +13,18 @@
 #include <unordered_map>
 
 namespace nvqir {
+/// This is used to track whether the tensor state is default initialized vs
+/// already has some gates applied to.
+constexpr std::int64_t InvalidTensorIndexValue = -1;
+
 /// @brief Wrapper of cutensornetState_t to provide convenient API's for CUDAQ
 /// simulator implementation.
 class TensorNetState {
   std::size_t m_numQubits;
   cutensornetHandle_t m_cutnHandle;
   cutensornetState_t m_quantumState;
+  /// Track id of gate tensors that are applied to the state tensors.
+  std::int64_t m_tensorId = InvalidTensorIndexValue;
 
 public:
   /// @brief Constructor
@@ -45,7 +51,8 @@ class TensorNetState {
 
   /// @brief Contract the tensor network representation to retrieve the state
   /// vector.
-  std::vector<std::complex<double>> getStateVector();
+  std::vector<std::complex<double>>
+  getStateVector(const std::vector<int32_t> &projectedModes = {});
 
   /// @brief Compute the reduce density matrix on a set of qubits
   ///
@@ -74,6 +81,9 @@ class TensorNetState {
   /// @brief Number of qubits that this state represents.
   std::size_t getNumQubits() const { return m_numQubits; }
 
+  /// @brief True if the state contains gate tensors (not just initial qubit
+  /// tensors)
+  bool isDirty() const { return m_tensorId > 0; }
   /// @brief Destructor
   ~TensorNetState();
 };

diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
@@ -80,7 +80,7 @@ macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "tensornet-mps")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_BACKEND_TENSORNET_MPS)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "custatevec-fp32")

diff --git a/unittests/integration/bug67_vqe_then_sample.cpp b/unittests/integration/bug67_vqe_then_sample.cpp
@@ -12,7 +12,7 @@
 #include <cudaq/optimizers.h>
 #include <cudaq/platform.h>
 
-#ifndef CUDAQ_BACKEND_DM
+#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET
 
 CUDAQ_TEST(VqeThenSample, checkBug67) {