From b28105e3e3767d72d1cd42e13377eee02da04404 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Thu, 14 Mar 2024 07:45:34 +0000
Subject: [PATCH 01/18] Prototype multi-ctrl via lib

---
 multi_ctrls.cpp | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 multi_ctrls.cpp
diff --git a/multi_ctrls.cpp b/multi_ctrls.cpp
new file mode 100644
index 0000000000..3e83558ece
--- /dev/null
+++ b/multi_ctrls.cpp
@@ -0,0 +1,96 @@
+#include <cudaq.h>
+namespace cudaq {
+
+void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ {
+  h(c);
+  cx(b, c);
+  t<cudaq::adj>(c);
+  cx(a, c);
+  t(c);
+  cx(b, c);
+  t<cudaq::adj>(c);
+  cx(a, c);
+  t(b);
+  t(c);
+  h(c);
+  cx(a, b);
+  t(a);
+  t<cudaq::adj>(b);
+  cx(a, b);
+}
+
+void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux,
+                     int adjustment) __qpu__ {
+  for (int i = 0; i < ctls.size() - 1; i += 2) {
+    CCNOT(ctls[i], ctls[i + 1], aux[i / 2]);
+  }
+  for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) {
+    CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
+  }
+}
+
+void CollectControls(
+    const std::vector<std::reference_wrapper<cudaq::qubit>> &ctls,
+    cudaq::qview<> aux, int adjustment) __qpu__ {
+  for (int i = 0; i < ctls.size() - 1; i += 2) {
+    CCNOT(ctls[i], ctls[i + 1], aux[i / 2]);
+  }
+  for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) {
+    CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
+  }
+}
+
+void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ {
+  if (ctls.size() % 2 != 0)
+    CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]);
+}
+
+template <size_t V, typename... T>
+decltype(auto) getParameterPackVals(T &&...Args) noexcept {
+  return std::get<V>(std::forward_as_tuple(std::forward<T>(Args)...));
+}
+
+template <typename mod, typename... QubitTy>
+void x(cudaq::qubit& c0, cudaq::qubit& c1, QubitTy &...qubits) __qpu__ {
+  static_assert(std::is_same_v<mod, cudaq::ctrl>);
+  static constexpr std::size_t qubitCount = sizeof...(qubits) + 2;
+  static constexpr std::size_t numCtrls = qubitCount - 1;
+  static_assert(numCtrls > 1);
+  if constexpr (numCtrls == 2) {
+    CCNOT(c0,
+          c1,
+          getParameterPackVals<0>(qubits...));
+  } else {
+    cudaq::qvector aux(numCtrls - 2);
+    std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};
+    ctls.pop_back();
+    ctls.emplace_back(c1);
+    ctls.emplace_back(c0);
+    assert(ctls.size() == numCtrls);
+    cudaq::compute_action(
+        [&]() { CollectControls(ctls, aux, 1 - (ctls.size() % 2)); },
+        [&]() {
+          if (ctls.size() % 2 != 0) {
+            CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
+          } else {
+            CCNOT(aux[ctls.size() - 3], aux[ctls.size() - 4], getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
+          }
+        });
+  }
+}
+} // namespace cudaq
+
+int main() {
+
+  auto kernel = []() __qpu__ {
+    cudaq::qarray<5> q;
+    x(q);
+    x<cudaq::ctrl>(q[0], q[1], q[2], q[3], q[4]);
+    mz(q);
+  };
+
+  auto counts = cudaq::sample(kernel);
+  counts.dump();
+
+  return 0;
+}
\ No newline at end of file

From f696fba2aa3d87d44dc180344fcb9304128d953d Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Fri, 15 Mar 2024 02:44:13 +0000
Subject: [PATCH 02/18] Control gate decomp lib

---
 .../kernels/decomposition/controlled_gates.h  | 121 ++++++++++++++++++
 runtime/cudaq/utils/cudaq_utils.h             |   7 +
 2 files changed, 128 insertions(+)
 create mode 100644 runtime/cudaq/kernels/decomposition/controlled_gates.h

diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h
new file mode 100644
index 0000000000..0625032e5a
--- /dev/null
+++ b/runtime/cudaq/kernels/decomposition/controlled_gates.h
@@ -0,0 +1,121 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cudaq.h>
+#include <cudaq/utils/cudaq_utils.h>
+
+namespace cudaq {
+namespace internal {
+
+/// Applies the doubly controlled–NOT (CCNOT) gate to three qubits via
+/// decomposition.
+void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ {
+  h(c);
+  cx(b, c);
+  t<cudaq::adj>(c);
+  cx(a, c);
+  t(c);
+  cx(b, c);
+  t<cudaq::adj>(c);
+  cx(a, c);
+  t(b);
+  t(c);
+  h(c);
+  cx(a, b);
+  t(a);
+  t<cudaq::adj>(b);
+  cx(a, b);
+}
+
+/// Collects the given list of control qubits into one or two of the given
+/// auxiliary qubits, using all but the last qubits in the auxiliary list as
+/// scratch qubits.
+/// For example, if the controls list is 6 qubits, the auxiliary list must be 5
+/// qubits, and the state from the 6 control qubits will be collected into the
+/// last qubit of the auxiliary array.
+/// The adjustment is used to allow the caller to reduce or increase
+/// the number of times this is run based on the eventual number of control
+/// qubits needed.
+void CollectControls(
+    const std::vector<std::reference_wrapper<cudaq::qubit>> &ctls,
+    cudaq::qview<> aux, int adjustment) __qpu__ {
+  for (int i = 0; i < ctls.size() - 1; i += 2) {
+    CCNOT(ctls[i], ctls[i + 1], aux[i / 2]);
+  }
+  for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) {
+    CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
+  }
+}
+
+void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux,
+                     int adjustment) __qpu__ {
+  std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;
+  for (auto &q : ctls)
+    castedCtls.emplace_back(q);
+
+  CollectControls(castedCtls, aux, adjustment);
+}
+
+/// When collecting controls, if there is an uneven number of original control
+/// qubits then the last control and the second to last auxiliary will be
+/// collected into the last auxiliary.
+void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ {
+  if (ctls.size() % 2 != 0)
+    CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]);
+}
+
+template <typename mod>
+void x(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,
+       cudaq::qubit &target) __qpu__ {
+  static_assert(std::is_same_v<mod, cudaq::ctrl>);
+  const std::size_t numCtrls = ctrls.size();
+  if (numCtrls == 0) {
+    x(target);
+  } else if (numCtrls == 1) {
+    cx(ctrls[0], target);
+  } else if (numCtrls == 2) {
+    CCNOT(ctrls[0], ctrls[1], target);
+  } else {
+    cudaq::qvector aux(numCtrls - 2);
+    cudaq::compute_action(
+        [&]() { CollectControls(ctrls, aux, 1 - (ctrls.size() % 2)); },
+        [&]() {
+          if (ctrls.size() % 2 != 0) {
+            CCNOT(ctrls[ctrls.size() - 1], aux[ctrls.size() - 3], target);
+          } else {
+            CCNOT(aux[ctrls.size() - 3], aux[ctrls.size() - 4], target);
+          }
+        });
+  }
+}
+} // namespace internal
+
+template <typename mod, typename... QubitTy>
+void x(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ {
+  static_assert(std::is_same_v<mod, cudaq::ctrl>);
+  std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};
+  // Last qubit is the target
+  ctls.pop_back();
+  // Add the two explicit qubits
+  ctls.emplace_back(c1);
+  ctls.emplace_back(c0);
+  internal::x<mod>(
+      ctls, cudaq::getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
+}
+
+template <typename mod>
+void x(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ {
+  static_assert(std::is_same_v<mod, cudaq::ctrl>);
+  std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;
+  for (auto &q : ctrls)
+    castedCtls.emplace_back(q);
+  internal::x<mod>(castedCtls, target);
+}
+} // namespace cudaq
diff --git a/runtime/cudaq/utils/cudaq_utils.h b/runtime/cudaq/utils/cudaq_utils.h
index 9e3280a232..16d7d824d4 100644
--- a/runtime/cudaq/utils/cudaq_utils.h
+++ b/runtime/cudaq/utils/cudaq_utils.h
@@ -188,6 +188,13 @@ void tuple_for_each_with_idx(TupleType &&t, FunctionType f) {
                           std::integral_constant<size_t, 0>());
 }
 
+// Utility function to access a parameter at index from a variadic parameter
+// pack.
+template <size_t Idx, typename... T>
+decltype(auto) getParameterPackVals(T &&...Args) noexcept {
+  return std::get<Idx>(std::forward_as_tuple(std::forward<T>(Args)...));
+}
+
 // Function check if file with given path+name exists
 inline bool fileExists(const std::string &name) {
   if (FILE *file = fopen(name.c_str(), "r")) {

From dd23431e54fc2a7fb719d71b3bb6fa453b22fff5 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Fri, 15 Mar 2024 03:36:26 +0000
Subject: [PATCH 03/18] Add multi-controlled z gate decomposition

---
 .../kernels/decomposition/controlled_gates.h  | 48 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h
index 0625032e5a..92bf17d3d7 100644
--- a/runtime/cudaq/kernels/decomposition/controlled_gates.h
+++ b/runtime/cudaq/kernels/decomposition/controlled_gates.h
@@ -66,7 +66,9 @@ void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux,
 /// When collecting controls, if there is an uneven number of original control
 /// qubits then the last control and the second to last auxiliary will be
 /// collected into the last auxiliary.
-void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ {
+void AdjustForSingleControl(
+    const std::vector<std::reference_wrapper<cudaq::qubit>> &ctls,
+    cudaq::qview<> aux) __qpu__ {
   if (ctls.size() % 2 != 0)
     CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]);
 }
@@ -79,7 +81,7 @@ void x(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,
   if (numCtrls == 0) {
     x(target);
   } else if (numCtrls == 1) {
-    cx(ctrls[0], target);
+    cx(ctrls[0].get(), target);
   } else if (numCtrls == 2) {
     CCNOT(ctrls[0], ctrls[1], target);
   } else {
@@ -95,6 +97,26 @@ void x(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,
         });
   }
 }
+
+template <typename mod>
+void z(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,
+       cudaq::qubit &target) __qpu__ {
+  static_assert(std::is_same_v<mod, cudaq::ctrl>);
+  const std::size_t numCtrls = ctrls.size();
+  if (numCtrls == 0) {
+    z(target);
+  } else if (numCtrls == 1) {
+    z<cudaq::ctrl>(ctrls[0].get(), target);
+  } else {
+    cudaq::qvector aux(numCtrls - 1);
+    cudaq::compute_action(
+        [&]() {
+          CollectControls(ctrls, aux, 0);
+          AdjustForSingleControl(ctrls, aux);
+        },
+        [&]() { z<cudaq::ctrl>(aux[ctrls.size() - 2], target); });
+  }
+}
 } // namespace internal
 
 template <typename mod, typename... QubitTy>
@@ -118,4 +140,26 @@ void x(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ {
     castedCtls.emplace_back(q);
   internal::x<mod>(castedCtls, target);
 }
+
+template <typename mod, typename... QubitTy>
+void z(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ {
+  static_assert(std::is_same_v<mod, cudaq::ctrl>);
+  std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};
+  // Last qubit is the target
+  ctls.pop_back();
+  // Add the two explicit qubits
+  ctls.emplace_back(c1);
+  ctls.emplace_back(c0);
+  internal::z<mod>(
+      ctls, cudaq::getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
+}
+
+template <typename mod>
+void z(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ {
+  static_assert(std::is_same_v<mod, cudaq::ctrl>);
+  std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;
+  for (auto &q : ctrls)
+    castedCtls.emplace_back(q);
+  internal::z<mod>(castedCtls, target);
+}
 } // namespace cudaq

From 18886a5533fc5f999b40f5c9701bb53f745a350d Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Fri, 15 Mar 2024 05:47:49 +0000
Subject: [PATCH 04/18] Lib mode for all multi-control decomp

---
 .../kernels/decomposition/controlled_gates.h  | 169 ++++++++++++------
 1 file changed, 111 insertions(+), 58 deletions(-)

diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h
index 92bf17d3d7..254eb0822a 100644
--- a/runtime/cudaq/kernels/decomposition/controlled_gates.h
+++ b/runtime/cudaq/kernels/decomposition/controlled_gates.h
@@ -98,68 +98,121 @@ void x(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,
   }
 }
 
-template <typename mod>
-void z(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,
-       cudaq::qubit &target) __qpu__ {
-  static_assert(std::is_same_v<mod, cudaq::ctrl>);
-  const std::size_t numCtrls = ctrls.size();
-  if (numCtrls == 0) {
-    z(target);
-  } else if (numCtrls == 1) {
-    z<cudaq::ctrl>(ctrls[0].get(), target);
-  } else {
-    cudaq::qvector aux(numCtrls - 1);
-    cudaq::compute_action(
-        [&]() {
-          CollectControls(ctrls, aux, 0);
-          AdjustForSingleControl(ctrls, aux);
-        },
-        [&]() { z<cudaq::ctrl>(aux[ctrls.size() - 2], target); });
+#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(NAME)                        \
+  template <typename mod>                                                      \
+  void NAME(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,    \
+            cudaq::qubit &target) __qpu__ {                                    \
+    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
+    const std::size_t numCtrls = ctrls.size();                                 \
+    if (numCtrls == 0) {                                                       \
+      NAME(target);                                                            \
+    } else if (numCtrls == 1) {                                                \
+      NAME<cudaq::ctrl>(ctrls[0].get(), target);                               \
+    } else {                                                                   \
+      cudaq::qvector aux(numCtrls - 1);                                        \
+      cudaq::compute_action(                                                   \
+          [&]() {                                                              \
+            CollectControls(ctrls, aux, 0);                                    \
+            AdjustForSingleControl(ctrls, aux);                                \
+          },                                                                   \
+          [&]() { NAME<cudaq::ctrl>(aux[ctrls.size() - 2], target); });        \
+    }                                                                          \
   }
-}
+
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(h)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(y)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(z)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(t)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(s)
+
+#define CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(NAME)              \
+  template <typename mod, typename ScalarAngle>                                \
+  void NAME(ScalarAngle angle,                                                 \
+            const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,    \
+            cudaq::qubit &target) __qpu__ {                                    \
+    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
+    const std::size_t numCtrls = ctrls.size();                                 \
+    if (numCtrls == 0) {                                                       \
+      NAME(angle, target);                                                     \
+    } else if (numCtrls == 1) {                                                \
+      oneQubitSingleParameterApply<qubit_op::NAME##Op, mod>(                   \
+          angle, ctrls[0].get(), target);                                      \
+    } else {                                                                   \
+      cudaq::qvector aux(numCtrls - 1);                                        \
+      cudaq::compute_action(                                                   \
+          [&]() {                                                              \
+            CollectControls(ctrls, aux, 0);                                    \
+            AdjustForSingleControl(ctrls, aux);                                \
+          },                                                                   \
+          [&]() {                                                              \
+            oneQubitSingleParameterApply<qubit_op::NAME##Op, mod>(             \
+                angle, aux[ctrls.size() - 2], target);                         \
+          });                                                                  \
+    }                                                                          \
+  }
+
+CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rx)
+CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(ry)
+CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rz)
+CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(r1)
 } // namespace internal
 
-template <typename mod, typename... QubitTy>
-void x(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ {
-  static_assert(std::is_same_v<mod, cudaq::ctrl>);
-  std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};
-  // Last qubit is the target
-  ctls.pop_back();
-  // Add the two explicit qubits
-  ctls.emplace_back(c1);
-  ctls.emplace_back(c0);
-  internal::x<mod>(
-      ctls, cudaq::getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
-}
+#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(NAME)                             \
+  template <typename mod, typename... QubitTy>                                 \
+  void NAME(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ {  \
+    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
+    std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};       \
+    /* Last qubit is the target */                                             \
+    ctls.pop_back();                                                           \
+    /*Add the two explicit qubits */                                           \
+    ctls.emplace_back(c1);                                                     \
+    ctls.emplace_back(c0);                                                     \
+    internal::NAME<mod>(                                                       \
+        ctls, cudaq::getParameterPackVals<sizeof...(qubits) - 1>(qubits...));  \
+  }                                                                            \
+  template <typename mod>                                                      \
+  void NAME(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ {              \
+    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
+    std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;              \
+    for (auto &q : ctrls)                                                      \
+      castedCtls.emplace_back(q);                                              \
+    internal::NAME<mod>(castedCtls, target);                                   \
+  }
 
-template <typename mod>
-void x(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ {
-  static_assert(std::is_same_v<mod, cudaq::ctrl>);
-  std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;
-  for (auto &q : ctrls)
-    castedCtls.emplace_back(q);
-  internal::x<mod>(castedCtls, target);
-}
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(h)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(x)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(y)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(z)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(t)
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(s)
 
-template <typename mod, typename... QubitTy>
-void z(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ {
-  static_assert(std::is_same_v<mod, cudaq::ctrl>);
-  std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};
-  // Last qubit is the target
-  ctls.pop_back();
-  // Add the two explicit qubits
-  ctls.emplace_back(c1);
-  ctls.emplace_back(c0);
-  internal::z<mod>(
-      ctls, cudaq::getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
-}
+#define CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(NAME)                   \
+  template <typename mod, typename ScalarAngle, typename... QubitTy>           \
+  void NAME(ScalarAngle angle, cudaq::qubit &c0, cudaq::qubit &c1,             \
+            QubitTy &...qubits) __qpu__ {                                      \
+    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
+    static_assert(sizeof...(qubits) > 0);                                      \
+    std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};       \
+    /* Last qubit is the target */                                             \
+    cudaq::qubit &target = ctls.back();                                        \
+    ctls.pop_back();                                                           \
+    /*Add the two explicit qubits */                                           \
+    ctls.emplace_back(c1);                                                     \
+    ctls.emplace_back(c0);                                                     \
+    internal::NAME<mod>(angle, ctls, target);                                  \
+  }                                                                            \
+  template <typename mod, typename ScalarAngle>                                \
+  void NAME(ScalarAngle angle, cudaq::qview<> ctrls, cudaq::qubit &target)     \
+      __qpu__ {                                                                \
+    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
+    std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;              \
+    for (auto &q : ctrls)                                                      \
+      castedCtls.emplace_back(q);                                              \
+    internal::NAME<mod>(angle, castedCtls, target);                            \
+  }
 
-template <typename mod>
-void z(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ {
-  static_assert(std::is_same_v<mod, cudaq::ctrl>);
-  std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;
-  for (auto &q : ctrls)
-    castedCtls.emplace_back(q);
-  internal::z<mod>(castedCtls, target);
-}
+CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rx)
+CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(ry)
+CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rz)
+CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(r1)
 } // namespace cudaq

From fa0c34a27db8025db021c07c8528b950d6031ca6 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Fri, 15 Mar 2024 06:08:37 +0000
Subject: [PATCH 05/18] Hook library-based decomp to MPS

---
 runtime/cudaq/qis/qubit_qis.h                      |  4 ++++
 runtime/nvqir/cutensornet/CMakeLists.txt           |  4 ++--
 runtime/nvqir/cutensornet/tensornet-mps.config     | 12 ++++++++++++
 runtime/nvqir/cutensornet/tensornet.config         | 11 +++++++++++
 unittests/integration/ccnot_tester.cpp             |  3 ---
 unittests/integration/gate_library_tester.cpp      |  4 +---
 unittests/integration/grover_test.cpp              |  3 ---
 unittests/integration/negative_controls_tester.cpp |  4 +---
 unittests/qis/QubitQISTester.cpp                   |  3 ---
 9 files changed, 31 insertions(+), 17 deletions(-)
 create mode 100644 runtime/nvqir/cutensornet/tensornet-mps.config
 create mode 100644 runtime/nvqir/cutensornet/tensornet.config

diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h
index 45ddb847b7..559b7d5521 100644
--- a/runtime/cudaq/qis/qubit_qis.h
+++ b/runtime/cudaq/qis/qubit_qis.h
@@ -633,3 +633,7 @@ std::vector<T> slice_vector(std::vector<T> &original, std::size_t start,
 }
 
 } // namespace cudaq
+
+#if defined(CUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION)
+#include <cudaq/kernels/decomposition/controlled_gates.h>
+#endif
diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt
index 23f10564af..22d8e30e8a 100644
--- a/runtime/nvqir/cutensornet/CMakeLists.txt
+++ b/runtime/nvqir/cutensornet/CMakeLists.txt
@@ -69,12 +69,12 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3")
     target_include_directories(nvqir-${LIBRARY_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/runtime/common ${CMAKE_SOURCE_DIR}/runtime/nvqir ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR})
     target_link_libraries(nvqir-${LIBRARY_NAME} PRIVATE fmt::fmt-header-only cudaq cudaq-common ${CUTENSORNET_LIB} ${CUTENSOR_LIB} CUDA::cudart)
     install(TARGETS nvqir-${LIBRARY_NAME} DESTINATION lib)
-    file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\n")
-    install(FILES ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config DESTINATION targets)
   endmacro()
 
   nvqir_create_cutn_plugin(tensornet ${BASE_TENSOR_BACKEND_SRS} simulator_tensornet_register.cpp )
   nvqir_create_cutn_plugin(tensornet-mps ${BASE_TENSOR_BACKEND_SRS} simulator_mps_register.cpp)
+  add_target_config(tensornet)
+  add_target_config(tensornet-mps)
   add_library(tensornet-mpi-util OBJECT mpi_support.cpp)
   target_include_directories(tensornet-mpi-util PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/runtime)
   target_link_libraries(tensornet-mpi-util PRIVATE cudaq-common fmt::fmt-header-only)
diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config
new file mode 100644
index 0000000000..2502f96e31
--- /dev/null
+++ b/runtime/nvqir/cutensornet/tensornet-mps.config
@@ -0,0 +1,12 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+NVQIR_SIMULATION_BACKEND="tensornet-mps"
+TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target"
+GPU_REQUIREMENTS="true"
+COMPILER_FLAGS="$COMPILER_FLAGS -DCUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION"
diff --git a/runtime/nvqir/cutensornet/tensornet.config b/runtime/nvqir/cutensornet/tensornet.config
new file mode 100644
index 0000000000..9f47ba1c88
--- /dev/null
+++ b/runtime/nvqir/cutensornet/tensornet.config
@@ -0,0 +1,11 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+NVQIR_SIMULATION_BACKEND="tensornet"
+TARGET_DESCRIPTION="cuTensorNet-based full tensor network contraction backend target"
+GPU_REQUIREMENTS="true"
diff --git a/unittests/integration/ccnot_tester.cpp b/unittests/integration/ccnot_tester.cpp
index 4d4a4220c8..4c1030fe34 100644
--- a/unittests/integration/ccnot_tester.cpp
+++ b/unittests/integration/ccnot_tester.cpp
@@ -54,8 +54,6 @@ struct nested_ctrl {
   }
 };
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
 CUDAQ_TEST(CCNOTTester, checkSimple) {
   auto ccnot = []() {
     cudaq::qvector q(3);
@@ -96,4 +94,3 @@ CUDAQ_TEST(FredkinTester, checkTruth) {
   EXPECT_EQ(counts.size(), 1);
   EXPECT_EQ(counts.begin()->first, "110");
 }
-#endif
diff --git a/unittests/integration/gate_library_tester.cpp b/unittests/integration/gate_library_tester.cpp
index 78de65f7d5..3d976199f5 100644
--- a/unittests/integration/gate_library_tester.cpp
+++ b/unittests/integration/gate_library_tester.cpp
@@ -88,8 +88,7 @@ CUDAQ_TEST(GateLibraryTester, checkGivensRotationKernelBuilder) {
   }
 }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
+
 CUDAQ_TEST(GateLibraryTester, checkControlledGivensRotation) {
   for (const auto &angle : cudaq::linspace(-M_PI, M_PI, NUM_ANGLES)) {
     // Same check, with 2 control qubits
@@ -117,7 +116,6 @@ CUDAQ_TEST(GateLibraryTester, checkControlledGivensRotation) {
     EXPECT_NEAR(std::abs(ss_01_off[0]), 1.0, 1e-6);
   }
 }
-#endif
 
 CUDAQ_TEST(GateLibraryTester, checkFermionicSwap) {
   for (const auto &angle : cudaq::linspace(-M_PI, M_PI, NUM_ANGLES)) {
diff --git a/unittests/integration/grover_test.cpp b/unittests/integration/grover_test.cpp
index be4fd3a374..3e64a749c7 100644
--- a/unittests/integration/grover_test.cpp
+++ b/unittests/integration/grover_test.cpp
@@ -48,8 +48,6 @@ struct oracle {
   }
 };
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
 CUDAQ_TEST(GroverTester, checkNISQ) {
   using namespace cudaq;
   auto counts = cudaq::sample(1000, run_grover{}, 3, 1, oracle{});
@@ -62,4 +60,3 @@ CUDAQ_TEST(GroverTester, checkNISQ) {
   }
   EXPECT_EQ(counter, 1000);
 }
-#endif
diff --git a/unittests/integration/negative_controls_tester.cpp b/unittests/integration/negative_controls_tester.cpp
index 6bd13b3404..c46e8ac48c 100644
--- a/unittests/integration/negative_controls_tester.cpp
+++ b/unittests/integration/negative_controls_tester.cpp
@@ -7,8 +7,7 @@
  ******************************************************************************/
 
 #include "CUDAQTestUtils.h"
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
+
 CUDAQ_TEST(NegativeControlsTester, checkSimple) {
 
   auto kernel = []() __qpu__ {
@@ -63,4 +62,3 @@ CUDAQ_TEST(NegativeControlsTester, checkSimple) {
 
   EXPECT_EQ(counter, 1000);
 }
-#endif
diff --git a/unittests/qis/QubitQISTester.cpp b/unittests/qis/QubitQISTester.cpp
index 8dc488d80d..cf92aa507a 100644
--- a/unittests/qis/QubitQISTester.cpp
+++ b/unittests/qis/QubitQISTester.cpp
@@ -157,8 +157,6 @@ CUDAQ_TEST(QubitQISTester, checkCommonKernel) {
   EXPECT_NEAR(energy, -1.7487, 1e-3);
 }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
 CUDAQ_TEST(QubitQISTester, checkCtrlRegion) {
 
   auto ccnot = []() {
@@ -229,7 +227,6 @@ CUDAQ_TEST(QubitQISTester, checkCtrlRegion) {
   EXPECT_EQ(1, counts3.size());
   EXPECT_TRUE(counts3.begin()->first == "101");
 }
-#endif
 
 CUDAQ_TEST(QubitQISTester, checkAdjointRegions) {
   struct single_adjoint_test {

From 1cd53255cc0b48010267e4500dbb4e52f819b040 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Tue, 19 Mar 2024 08:02:12 +0000
Subject: [PATCH 06/18] Make multi-ctrl decomposition an execution manager
 since we need to intercept control

---
 runtime/nvqir/cutensornet/CMakeLists.txt      |  18 +
 .../nvqir/cutensornet/MpsExecutionManager.cpp | 467 ++++++++++++++++++
 .../nvqir/cutensornet/tensornet-mps.config    |   1 +
 3 files changed, 486 insertions(+)
 create mode 100644 runtime/nvqir/cutensornet/MpsExecutionManager.cpp

diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt
index 22d8e30e8a..9cb46fd27f 100644
--- a/runtime/nvqir/cutensornet/CMakeLists.txt
+++ b/runtime/nvqir/cutensornet/CMakeLists.txt
@@ -90,6 +90,24 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3")
     target_link_libraries(nvqir-tensornet PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
     target_link_libraries(nvqir-tensornet-mps PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
   endif()
+
+  add_library(cudaq-em-mps SHARED MpsExecutionManager.cpp)
+  set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-em-mps)
+  target_include_directories(cudaq-em-mps
+      PUBLIC 
+        $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/runtime>
+        $<INSTALL_INTERFACE:include>
+      PRIVATE .)
+
+  target_link_libraries(cudaq-em-mps
+    PUBLIC cudaq-spin PRIVATE nvqir cudaq-common fmt::fmt-header-only LLVMSupport)
+
+  install(TARGETS cudaq-em-mps EXPORT cudaq-em-mps-targets DESTINATION lib)
+
+  install(EXPORT cudaq-em-mps-targets
+          FILE CUDAQEmMpsTargets.cmake
+          NAMESPACE cudaq::
+          DESTINATION lib/cmake/cudaq)
 else()
   message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.3.0+.")
 endif()
diff --git a/runtime/nvqir/cutensornet/MpsExecutionManager.cpp b/runtime/nvqir/cutensornet/MpsExecutionManager.cpp
new file mode 100644
index 0000000000..8245a0f131
--- /dev/null
+++ b/runtime/nvqir/cutensornet/MpsExecutionManager.cpp
@@ -0,0 +1,467 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "common/Logger.h"
+#include "cudaq/qis/managers/BasicExecutionManager.h"
+#include "nvqir/CircuitSimulator.h"
+#include "llvm/ADT/StringSwitch.h"
+
+namespace nvqir {
+CircuitSimulator *getCircuitSimulatorInternal();
+}
+namespace {
+class MpsExecutionManager : public cudaq::BasicExecutionManager {
+private:
+  nvqir::CircuitSimulator *simulator() {
+    return nvqir::getCircuitSimulatorInternal();
+  }
+
+  /// @brief To improve `qudit` allocation, we defer
+  /// single `qudit` allocation requests until the first
+  /// encountered `apply` call.
+  std::vector<cudaq::QuditInfo> requestedAllocations;
+  std::vector<cudaq::QuditInfo> auxQuditIdsForDeletion;
+
+  std::vector<Instruction>
+  decomposeMultiControlledInstruction(const Instruction &instruction,
+                                      std::vector<cudaq::QuditInfo> &aux) {
+    // Get the data, create the Qubit* targets
+    auto [gateName, parameters, controls, targets, op] = instruction;
+    if (controls.size() + targets.size() <= 2) {
+      return {instruction};
+    }
+    std::vector<Instruction> decomposedInsts;
+
+    const auto makeInstruction =
+        [](const std::string gateName, const std::vector<double> &gateParams,
+           const std::vector<cudaq::QuditInfo> &ctrls,
+           const std::vector<cudaq::QuditInfo> &targets) -> Instruction {
+      return std::make_tuple(gateName, gateParams, ctrls, targets, cudaq::spin_op{});
+    };
+
+    if (targets.size() > 1) {
+      if (gateName == "swap") {
+        {
+          auto mutableCtrls = controls;
+          mutableCtrls.emplace_back(targets[0]);
+          const auto insts = decomposeMultiControlledInstruction(
+              makeInstruction("x", {}, mutableCtrls, {targets[1]}),
+              auxQuditIdsForDeletion);
+          decomposedInsts.insert(decomposedInsts.end(), insts.begin(),
+                                 insts.end());
+        }
+        {
+          auto mutableCtrls = controls;
+          mutableCtrls.emplace_back(targets[1]);
+          const auto insts = decomposeMultiControlledInstruction(
+              makeInstruction("x", {}, mutableCtrls, {targets[0]}),
+              auxQuditIdsForDeletion);
+          decomposedInsts.insert(decomposedInsts.end(), insts.begin(),
+                                 insts.end());
+        }
+        {
+          auto mutableCtrls = controls;
+          mutableCtrls.emplace_back(targets[0]);
+          const auto insts = decomposeMultiControlledInstruction(
+              makeInstruction("x", {}, mutableCtrls, {targets[1]}),
+              auxQuditIdsForDeletion);
+          decomposedInsts.insert(decomposedInsts.end(), insts.begin(),
+                                 insts.end());
+        }
+        return decomposedInsts;
+      } else if (gateName == "exp_pauli") {
+        if (controls.size() <= 1) {
+          return {instruction};
+        } else {
+          std::vector<cudaq::QuditInfo> qubitSupport;
+          std::vector<std::function<void(bool)>> basisChange;
+          op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) {
+            if (type != cudaq::pauli::I)
+              qubitSupport.push_back(targets[qubitIdx]);
+
+            if (type == cudaq::pauli::Y)
+              basisChange.emplace_back([&, qubitIdx](bool reverse) {
+                decomposedInsts.emplace_back(
+                    makeInstruction("rx", {!reverse ? M_PI_2 : -M_PI_2}, {},
+                                    {targets[qubitIdx]}));
+              });
+            else if (type == cudaq::pauli::X)
+              basisChange.emplace_back([&, qubitIdx](bool) {
+                decomposedInsts.emplace_back(
+                    makeInstruction("h", {}, {}, {targets[qubitIdx]}));
+              });
+          });
+
+          if (!basisChange.empty())
+            for (auto &basis : basisChange)
+              basis(false);
+
+          std::vector<std::pair<cudaq::QuditInfo, cudaq::QuditInfo>> toReverse;
+          for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) {
+            decomposedInsts.emplace_back(makeInstruction(
+                "x", {}, {qubitSupport[i]}, {qubitSupport[i + 1]}));
+            toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]);
+          }
+
+          // Since this is a compute-action-uncompute type circuit, we only need
+          // to apply control on this rz gate.
+          {
+            const auto mcRzInsts = decomposeMultiControlledInstruction(
+                makeInstruction("rz", {-2.0 * parameters[0]}, controls,
+                                {qubitSupport.back()}),
+                auxQuditIdsForDeletion);
+            decomposedInsts.insert(decomposedInsts.end(), mcRzInsts.begin(),
+                                   mcRzInsts.end());
+          }
+
+          std::reverse(toReverse.begin(), toReverse.end());
+          for (auto &[i, j] : toReverse)
+            decomposedInsts.emplace_back(makeInstruction("x", {}, {i}, {j}));
+
+          if (!basisChange.empty()) {
+            std::reverse(basisChange.begin(), basisChange.end());
+            for (auto &basis : basisChange)
+              basis(true);
+          }
+          return decomposedInsts;
+        }
+      } else {
+        throw std::runtime_error("Unsupported: " + gateName);
+      }
+    }
+
+    const auto ccnot = [&](cudaq::QuditInfo &a, cudaq::QuditInfo &b,
+                           cudaq::QuditInfo &c) {
+      decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {b}));
+      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c}));
+      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b}));
+      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {a}));
+      decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {b}));
+      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b}));
+    };
+
+    const auto collectControls = [&](std::vector<cudaq::QuditInfo> &ctls,
+                                     std::vector<cudaq::QuditInfo> &aux,
+                                     int adjustment) {
+      for (int i = 0; i < static_cast<int>(ctls.size()) - 1; i += 2) {
+        ccnot(ctls[i], ctls[i + 1], aux[i / 2]);
+      }
+      for (int i = 0; i < static_cast<int>(ctls.size()) / 2 - 1 - adjustment;
+           ++i) {
+        ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
+      }
+    };
+    const auto adjustForSingleControl =
+        [&](std::vector<cudaq::QuditInfo> &ctls,
+            std::vector<cudaq::QuditInfo> &aux) {
+          if (ctls.size() % 2 != 0)
+            ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3],
+                  aux[ctls.size() - 2]);
+        };
+    for (std::size_t i = aux.size(); i < controls.size() - 1; ++i)
+      aux.emplace_back(cudaq::QuditInfo(2, getAvailableIndex(2)));
+
+    collectControls(controls, aux, 0);
+    adjustForSingleControl(controls, aux);
+    // Add to the instruction queue
+    decomposedInsts.emplace_back(
+        std::move(gateName), parameters,
+        std::vector<cudaq::QuditInfo>{aux[controls.size() - 2]}, targets, op);
+    adjustForSingleControl(controls, aux);
+    collectControls(controls, aux, 0);
+    return decomposedInsts;
+  }
+
+  /// @brief Allocate all requested `qudits`.
+  void flushRequestedAllocations() {
+    if (requestedAllocations.empty())
+      return;
+
+    allocateQudits(requestedAllocations);
+    requestedAllocations.clear();
+  }
+
+protected:
+  void allocateQudit(const cudaq::QuditInfo &q) override {
+    requestedAllocations.emplace_back(2, q.id);
+  }
+
+  void allocateQudits(const std::vector<cudaq::QuditInfo> &qudits) override {
+    simulator()->allocateQubits(qudits.size());
+  }
+
+  void deallocateQudit(const cudaq::QuditInfo &q) override {
+
+    // Before trying to deallocate, make sure the qudit hasn't
+    // been requested but not allocated.
+    auto iter =
+        std::find(requestedAllocations.begin(), requestedAllocations.end(), q);
+    if (iter != requestedAllocations.end()) {
+      requestedAllocations.erase(iter);
+      return;
+    }
+
+    simulator()->deallocate(q.id);
+  }
+
+  void deallocateQudits(const std::vector<cudaq::QuditInfo> &qudits) override {
+    std::vector<std::size_t> local;
+    for (auto &q : qudits) {
+      auto iter = std::find(requestedAllocations.begin(),
+                            requestedAllocations.end(), q);
+      if (iter != requestedAllocations.end()) {
+        requestedAllocations.erase(iter);
+      } else {
+        local.push_back(q.id);
+      }
+    }
+
+    simulator()->deallocateQubits(local);
+  }
+
+  void handleExecutionContextChanged() override {
+    requestedAllocations.clear();
+    simulator()->setExecutionContext(executionContext);
+  }
+
+  void handleExecutionContextEnded() override {
+    simulator()->resetExecutionContext();
+  }
+
+  void executeInstruction(const Instruction &instruction) override {
+    flushRequestedAllocations();
+
+    // Get the data, create the Qubit* targets
+    auto [gateName, parameters, controls, targets, op] = instruction;
+
+    // Map the Qudits to Qubits
+    std::vector<std::size_t> localT;
+    std::transform(targets.begin(), targets.end(), std::back_inserter(localT),
+                   [](auto &&el) { return el.id; });
+    std::vector<std::size_t> localC;
+    std::transform(controls.begin(), controls.end(), std::back_inserter(localC),
+                   [](auto &&el) { return el.id; });
+
+    // Apply the gate
+    llvm::StringSwitch<std::function<void()>>(gateName)
+        .Case("h", [&]() { simulator()->h(localC, localT[0]); })
+        .Case("x", [&]() { simulator()->x(localC, localT[0]); })
+        .Case("y", [&]() { simulator()->y(localC, localT[0]); })
+        .Case("z", [&]() { simulator()->z(localC, localT[0]); })
+        .Case("rx",
+              [&]() { simulator()->rx(parameters[0], localC, localT[0]); })
+        .Case("ry",
+              [&]() { simulator()->ry(parameters[0], localC, localT[0]); })
+        .Case("rz",
+              [&]() { simulator()->rz(parameters[0], localC, localT[0]); })
+        .Case("s", [&]() { simulator()->s(localC, localT[0]); })
+        .Case("t", [&]() { simulator()->t(localC, localT[0]); })
+        .Case("sdg", [&]() { simulator()->sdg(localC, localT[0]); })
+        .Case("tdg", [&]() { simulator()->tdg(localC, localT[0]); })
+        .Case("r1",
+              [&]() { simulator()->r1(parameters[0], localC, localT[0]); })
+        .Case("u1",
+              [&]() { simulator()->u1(parameters[0], localC, localT[0]); })
+        .Case("u3",
+              [&]() {
+                simulator()->u3(parameters[0], parameters[1], parameters[2],
+                                localC, localT[0]);
+              })
+        .Case("swap",
+              [&]() { simulator()->swap(localC, localT[0], localT[1]); })
+        .Case("exp_pauli",
+              [&]() {
+                simulator()->applyExpPauli(parameters[0], localC, localT, op);
+              })
+        .Default([&]() {
+          throw std::runtime_error("[DefaultExecutionManager] invalid gate "
+                                   "application requested " +
+                                   gateName + ".");
+        })();
+  }
+
+  int measureQudit(const cudaq::QuditInfo &q,
+                   const std::string &registerName) override {
+    flushRequestedAllocations();
+    return simulator()->mz(q.id, registerName);
+  }
+
+  void measureSpinOp(const cudaq::spin_op &op) override {
+    flushRequestedAllocations();
+    simulator()->flushGateQueue();
+
+    if (executionContext->canHandleObserve) {
+      auto result = simulator()->observe(*executionContext->spin.value());
+      executionContext->expectationValue = result.expectationValue;
+      executionContext->result = cudaq::sample_result(result);
+      return;
+    }
+
+    assert(op.num_terms() == 1 && "Number of terms is not 1.");
+
+    cudaq::info("Measure {}", op.to_string(false));
+    std::vector<std::size_t> qubitsToMeasure;
+    std::vector<std::function<void(bool)>> basisChange;
+    op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) {
+      if (type != cudaq::pauli::I)
+        qubitsToMeasure.push_back(qubitIdx);
+
+      if (type == cudaq::pauli::Y)
+        basisChange.emplace_back([&, qubitIdx](bool reverse) {
+          simulator()->rx(!reverse ? M_PI_2 : -M_PI_2, qubitIdx);
+        });
+      else if (type == cudaq::pauli::X)
+        basisChange.emplace_back(
+            [&, qubitIdx](bool) { simulator()->h(qubitIdx); });
+    });
+
+    // Change basis, flush the queue
+    if (!basisChange.empty()) {
+      for (auto &basis : basisChange)
+        basis(false);
+
+      simulator()->flushGateQueue();
+    }
+
+    // Get whether this is shots-based
+    int shots = 0;
+    if (executionContext->shots > 0)
+      shots = executionContext->shots;
+
+    // Sample and give the data to the context
+    cudaq::ExecutionResult result = simulator()->sample(qubitsToMeasure, shots);
+    executionContext->expectationValue = result.expectationValue;
+    executionContext->result = cudaq::sample_result(result);
+
+    // Restore the state.
+    if (!basisChange.empty()) {
+      std::reverse(basisChange.begin(), basisChange.end());
+      for (auto &basis : basisChange)
+        basis(true);
+
+      simulator()->flushGateQueue();
+    }
+  }
+
+public:
+  MpsExecutionManager() {
+    cudaq::info("[MpsExecutionManager] Creating the {} backend.",
+                simulator()->name());
+  }
+  virtual ~MpsExecutionManager() = default;
+
+  void resetQudit(const cudaq::QuditInfo &q) override {
+    flushRequestedAllocations();
+    simulator()->resetQubit(q.id);
+  }
+
+  void endAdjointRegion() override {
+    assert(!adjointQueueStack.empty() && "There must be at least one queue");
+
+    auto adjointQueue = std::move(adjointQueueStack.back());
+    adjointQueueStack.pop_back();
+
+    // Select the queue to which these instructions will be added.
+    InstructionQueue *queue = adjointQueueStack.empty()
+                                  ? &instructionQueue
+                                  : &(adjointQueueStack.back());
+
+    std::reverse(adjointQueue.begin(), adjointQueue.end());
+    for (auto &instruction : adjointQueue) {
+      const auto insts = decomposeMultiControlledInstruction(
+          instruction, auxQuditIdsForDeletion);
+      queue->insert(queue->end(), insts.begin(), insts.end());
+    }
+  }
+
+  /// The goal for apply is to create a new element of the
+  /// instruction queue (a tuple).
+  void apply(const std::string_view gateName, const std::vector<double> &params,
+             const std::vector<cudaq::QuditInfo> &controls,
+             const std::vector<cudaq::QuditInfo> &targets,
+             bool isAdjoint, cudaq::spin_op op) override {
+
+    // Make a copy of the name that we can mutate if necessary
+    std::string mutable_name(gateName);
+
+    // Make a copy of the parameters that we can mutate
+    std::vector<double> mutable_params = params;
+
+    // Create an array of controls, we will
+    // prepend any extra controls if in a control region
+    std::vector<cudaq::QuditInfo> mutable_controls;
+    for (auto &e : extraControlIds)
+      mutable_controls.emplace_back(2, e);
+
+    for (auto &e : controls)
+      mutable_controls.push_back(e);
+
+    std::vector<cudaq::QuditInfo> mutable_targets;
+    for (auto &t : targets)
+      mutable_targets.push_back(t);
+    // We need to check if we need take the adjoint of the operation. To do this
+    // we use a logical XOR between `isAdjoint` and whether the size of
+    // `adjointQueueStack` is even. The size of `adjointQueueStack` corresponds
+    // to the number of nested `cudaq::adjoint` calls. If the size is even, then
+    // we need to change the operation when `isAdjoint` is true. If the size is
+    // odd, then we need to change the operation when `isAdjoint` is false.
+    // (Adjoint modifiers cancel each other, e.g, `adj adj r1` is `r1`.)
+    //
+    // The cases:
+    //  * not-adjoint, even number of `cudaq::adjoint` => _no_ need to change op
+    //  * not-adjoint, odd number of `cudaq::adjoint`  => change op
+    //  * adjoint,     even number of `cudaq::adjoint` => change op
+    //  * adjoint,     odd number `cudaq::adjoint`     => _no_ need to change op
+    //
+    bool evenAdjointStack = (adjointQueueStack.size() % 2) == 0;
+    if (isAdjoint != !evenAdjointStack) {
+      for (std::size_t i = 0; i < params.size(); i++)
+        mutable_params[i] = -1.0 * params[i];
+      if (gateName == "t")
+        mutable_name = "tdg";
+      else if (gateName == "s")
+        mutable_name = "sdg";
+    }
+
+    if (!adjointQueueStack.empty()) {
+      // Add to the adjoint instruction queue
+      adjointQueueStack.back().emplace_back(
+          mutable_name, mutable_params, mutable_controls, mutable_targets, op);
+      return;
+    }
+
+    const auto insts = decomposeMultiControlledInstruction(
+        {std::move(mutable_name), mutable_params, mutable_controls,
+         mutable_targets, op},
+        auxQuditIdsForDeletion);
+    instructionQueue.insert(instructionQueue.end(), insts.begin(), insts.end());
+  }
+
+  void resetExecutionContext() override {
+    BasicExecutionManager::resetExecutionContext();
+
+    deallocateQudits(auxQuditIdsForDeletion);
+    for (auto &q : auxQuditIdsForDeletion) {
+      returnIndex(q.id);
+    }
+    auxQuditIdsForDeletion.clear();
+  }
+};
+
+} // namespace
+
+CUDAQ_REGISTER_EXECUTION_MANAGER(MpsExecutionManager)
\ No newline at end of file
diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config
index 2502f96e31..040ae0d8bf 100644
--- a/runtime/nvqir/cutensornet/tensornet-mps.config
+++ b/runtime/nvqir/cutensornet/tensornet-mps.config
@@ -10,3 +10,4 @@ NVQIR_SIMULATION_BACKEND="tensornet-mps"
 TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target"
 GPU_REQUIREMENTS="true"
 COMPILER_FLAGS="$COMPILER_FLAGS -DCUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION"
+LIBRARY_MODE_EXECUTION_MANAGER="mps"
\ No newline at end of file

From ec8020481abdea81d7cd276b9dc43c4e753f7e49 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 00:34:42 +0000
Subject: [PATCH 07/18] add decomp to simulator

---
 runtime/cudaq/qis/qubit_qis.h                 |   4 -
 runtime/nvqir/cutensornet/CMakeLists.txt      |  18 -
 .../nvqir/cutensornet/MpsExecutionManager.cpp | 467 ------------------
 .../cutensornet/simulator_mps_register.cpp    | 219 +++++++-
 .../nvqir/cutensornet/tensornet-mps.config    |   2 -
 runtime/nvqir/cutensornet/tensornet_state.cpp |  28 +-
 runtime/nvqir/cutensornet/tensornet_state.h   |   9 +-
 unittests/integration/builder_tester.cpp      |  15 -
 8 files changed, 240 insertions(+), 522 deletions(-)
 delete mode 100644 runtime/nvqir/cutensornet/MpsExecutionManager.cpp

diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h
index cb2f7caf83..9197720f48 100644
--- a/runtime/cudaq/qis/qubit_qis.h
+++ b/runtime/cudaq/qis/qubit_qis.h
@@ -636,7 +636,3 @@ std::vector<T> slice_vector(std::vector<T> &original, std::size_t start,
 }
 
 } // namespace cudaq
-
-#if defined(CUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION)
-#include <cudaq/kernels/decomposition/controlled_gates.h>
-#endif
diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt
index 9cb46fd27f..22d8e30e8a 100644
--- a/runtime/nvqir/cutensornet/CMakeLists.txt
+++ b/runtime/nvqir/cutensornet/CMakeLists.txt
@@ -90,24 +90,6 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3")
     target_link_libraries(nvqir-tensornet PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
     target_link_libraries(nvqir-tensornet-mps PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive)
   endif()
-
-  add_library(cudaq-em-mps SHARED MpsExecutionManager.cpp)
-  set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-em-mps)
-  target_include_directories(cudaq-em-mps
-      PUBLIC 
-        $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/runtime>
-        $<INSTALL_INTERFACE:include>
-      PRIVATE .)
-
-  target_link_libraries(cudaq-em-mps
-    PUBLIC cudaq-spin PRIVATE nvqir cudaq-common fmt::fmt-header-only LLVMSupport)
-
-  install(TARGETS cudaq-em-mps EXPORT cudaq-em-mps-targets DESTINATION lib)
-
-  install(EXPORT cudaq-em-mps-targets
-          FILE CUDAQEmMpsTargets.cmake
-          NAMESPACE cudaq::
-          DESTINATION lib/cmake/cudaq)
 else()
   message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.3.0+.")
 endif()
diff --git a/runtime/nvqir/cutensornet/MpsExecutionManager.cpp b/runtime/nvqir/cutensornet/MpsExecutionManager.cpp
deleted file mode 100644
index 8245a0f131..0000000000
--- a/runtime/nvqir/cutensornet/MpsExecutionManager.cpp
+++ /dev/null
@@ -1,467 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "common/Logger.h"
-#include "cudaq/qis/managers/BasicExecutionManager.h"
-#include "nvqir/CircuitSimulator.h"
-#include "llvm/ADT/StringSwitch.h"
-
-namespace nvqir {
-CircuitSimulator *getCircuitSimulatorInternal();
-}
-namespace {
-class MpsExecutionManager : public cudaq::BasicExecutionManager {
-private:
-  nvqir::CircuitSimulator *simulator() {
-    return nvqir::getCircuitSimulatorInternal();
-  }
-
-  /// @brief To improve `qudit` allocation, we defer
-  /// single `qudit` allocation requests until the first
-  /// encountered `apply` call.
-  std::vector<cudaq::QuditInfo> requestedAllocations;
-  std::vector<cudaq::QuditInfo> auxQuditIdsForDeletion;
-
-  std::vector<Instruction>
-  decomposeMultiControlledInstruction(const Instruction &instruction,
-                                      std::vector<cudaq::QuditInfo> &aux) {
-    // Get the data, create the Qubit* targets
-    auto [gateName, parameters, controls, targets, op] = instruction;
-    if (controls.size() + targets.size() <= 2) {
-      return {instruction};
-    }
-    std::vector<Instruction> decomposedInsts;
-
-    const auto makeInstruction =
-        [](const std::string gateName, const std::vector<double> &gateParams,
-           const std::vector<cudaq::QuditInfo> &ctrls,
-           const std::vector<cudaq::QuditInfo> &targets) -> Instruction {
-      return std::make_tuple(gateName, gateParams, ctrls, targets, cudaq::spin_op{});
-    };
-
-    if (targets.size() > 1) {
-      if (gateName == "swap") {
-        {
-          auto mutableCtrls = controls;
-          mutableCtrls.emplace_back(targets[0]);
-          const auto insts = decomposeMultiControlledInstruction(
-              makeInstruction("x", {}, mutableCtrls, {targets[1]}),
-              auxQuditIdsForDeletion);
-          decomposedInsts.insert(decomposedInsts.end(), insts.begin(),
-                                 insts.end());
-        }
-        {
-          auto mutableCtrls = controls;
-          mutableCtrls.emplace_back(targets[1]);
-          const auto insts = decomposeMultiControlledInstruction(
-              makeInstruction("x", {}, mutableCtrls, {targets[0]}),
-              auxQuditIdsForDeletion);
-          decomposedInsts.insert(decomposedInsts.end(), insts.begin(),
-                                 insts.end());
-        }
-        {
-          auto mutableCtrls = controls;
-          mutableCtrls.emplace_back(targets[0]);
-          const auto insts = decomposeMultiControlledInstruction(
-              makeInstruction("x", {}, mutableCtrls, {targets[1]}),
-              auxQuditIdsForDeletion);
-          decomposedInsts.insert(decomposedInsts.end(), insts.begin(),
-                                 insts.end());
-        }
-        return decomposedInsts;
-      } else if (gateName == "exp_pauli") {
-        if (controls.size() <= 1) {
-          return {instruction};
-        } else {
-          std::vector<cudaq::QuditInfo> qubitSupport;
-          std::vector<std::function<void(bool)>> basisChange;
-          op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) {
-            if (type != cudaq::pauli::I)
-              qubitSupport.push_back(targets[qubitIdx]);
-
-            if (type == cudaq::pauli::Y)
-              basisChange.emplace_back([&, qubitIdx](bool reverse) {
-                decomposedInsts.emplace_back(
-                    makeInstruction("rx", {!reverse ? M_PI_2 : -M_PI_2}, {},
-                                    {targets[qubitIdx]}));
-              });
-            else if (type == cudaq::pauli::X)
-              basisChange.emplace_back([&, qubitIdx](bool) {
-                decomposedInsts.emplace_back(
-                    makeInstruction("h", {}, {}, {targets[qubitIdx]}));
-              });
-          });
-
-          if (!basisChange.empty())
-            for (auto &basis : basisChange)
-              basis(false);
-
-          std::vector<std::pair<cudaq::QuditInfo, cudaq::QuditInfo>> toReverse;
-          for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) {
-            decomposedInsts.emplace_back(makeInstruction(
-                "x", {}, {qubitSupport[i]}, {qubitSupport[i + 1]}));
-            toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]);
-          }
-
-          // Since this is a compute-action-uncompute type circuit, we only need
-          // to apply control on this rz gate.
-          {
-            const auto mcRzInsts = decomposeMultiControlledInstruction(
-                makeInstruction("rz", {-2.0 * parameters[0]}, controls,
-                                {qubitSupport.back()}),
-                auxQuditIdsForDeletion);
-            decomposedInsts.insert(decomposedInsts.end(), mcRzInsts.begin(),
-                                   mcRzInsts.end());
-          }
-
-          std::reverse(toReverse.begin(), toReverse.end());
-          for (auto &[i, j] : toReverse)
-            decomposedInsts.emplace_back(makeInstruction("x", {}, {i}, {j}));
-
-          if (!basisChange.empty()) {
-            std::reverse(basisChange.begin(), basisChange.end());
-            for (auto &basis : basisChange)
-              basis(true);
-          }
-          return decomposedInsts;
-        }
-      } else {
-        throw std::runtime_error("Unsupported: " + gateName);
-      }
-    }
-
-    const auto ccnot = [&](cudaq::QuditInfo &a, cudaq::QuditInfo &b,
-                           cudaq::QuditInfo &c) {
-      decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {b}));
-      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c}));
-      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b}));
-      decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {a}));
-      decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {b}));
-      decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b}));
-    };
-
-    const auto collectControls = [&](std::vector<cudaq::QuditInfo> &ctls,
-                                     std::vector<cudaq::QuditInfo> &aux,
-                                     int adjustment) {
-      for (int i = 0; i < static_cast<int>(ctls.size()) - 1; i += 2) {
-        ccnot(ctls[i], ctls[i + 1], aux[i / 2]);
-      }
-      for (int i = 0; i < static_cast<int>(ctls.size()) / 2 - 1 - adjustment;
-           ++i) {
-        ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
-      }
-    };
-    const auto adjustForSingleControl =
-        [&](std::vector<cudaq::QuditInfo> &ctls,
-            std::vector<cudaq::QuditInfo> &aux) {
-          if (ctls.size() % 2 != 0)
-            ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3],
-                  aux[ctls.size() - 2]);
-        };
-    for (std::size_t i = aux.size(); i < controls.size() - 1; ++i)
-      aux.emplace_back(cudaq::QuditInfo(2, getAvailableIndex(2)));
-
-    collectControls(controls, aux, 0);
-    adjustForSingleControl(controls, aux);
-    // Add to the instruction queue
-    decomposedInsts.emplace_back(
-        std::move(gateName), parameters,
-        std::vector<cudaq::QuditInfo>{aux[controls.size() - 2]}, targets, op);
-    adjustForSingleControl(controls, aux);
-    collectControls(controls, aux, 0);
-    return decomposedInsts;
-  }
-
-  /// @brief Allocate all requested `qudits`.
-  void flushRequestedAllocations() {
-    if (requestedAllocations.empty())
-      return;
-
-    allocateQudits(requestedAllocations);
-    requestedAllocations.clear();
-  }
-
-protected:
-  void allocateQudit(const cudaq::QuditInfo &q) override {
-    requestedAllocations.emplace_back(2, q.id);
-  }
-
-  void allocateQudits(const std::vector<cudaq::QuditInfo> &qudits) override {
-    simulator()->allocateQubits(qudits.size());
-  }
-
-  void deallocateQudit(const cudaq::QuditInfo &q) override {
-
-    // Before trying to deallocate, make sure the qudit hasn't
-    // been requested but not allocated.
-    auto iter =
-        std::find(requestedAllocations.begin(), requestedAllocations.end(), q);
-    if (iter != requestedAllocations.end()) {
-      requestedAllocations.erase(iter);
-      return;
-    }
-
-    simulator()->deallocate(q.id);
-  }
-
-  void deallocateQudits(const std::vector<cudaq::QuditInfo> &qudits) override {
-    std::vector<std::size_t> local;
-    for (auto &q : qudits) {
-      auto iter = std::find(requestedAllocations.begin(),
-                            requestedAllocations.end(), q);
-      if (iter != requestedAllocations.end()) {
-        requestedAllocations.erase(iter);
-      } else {
-        local.push_back(q.id);
-      }
-    }
-
-    simulator()->deallocateQubits(local);
-  }
-
-  void handleExecutionContextChanged() override {
-    requestedAllocations.clear();
-    simulator()->setExecutionContext(executionContext);
-  }
-
-  void handleExecutionContextEnded() override {
-    simulator()->resetExecutionContext();
-  }
-
-  void executeInstruction(const Instruction &instruction) override {
-    flushRequestedAllocations();
-
-    // Get the data, create the Qubit* targets
-    auto [gateName, parameters, controls, targets, op] = instruction;
-
-    // Map the Qudits to Qubits
-    std::vector<std::size_t> localT;
-    std::transform(targets.begin(), targets.end(), std::back_inserter(localT),
-                   [](auto &&el) { return el.id; });
-    std::vector<std::size_t> localC;
-    std::transform(controls.begin(), controls.end(), std::back_inserter(localC),
-                   [](auto &&el) { return el.id; });
-
-    // Apply the gate
-    llvm::StringSwitch<std::function<void()>>(gateName)
-        .Case("h", [&]() { simulator()->h(localC, localT[0]); })
-        .Case("x", [&]() { simulator()->x(localC, localT[0]); })
-        .Case("y", [&]() { simulator()->y(localC, localT[0]); })
-        .Case("z", [&]() { simulator()->z(localC, localT[0]); })
-        .Case("rx",
-              [&]() { simulator()->rx(parameters[0], localC, localT[0]); })
-        .Case("ry",
-              [&]() { simulator()->ry(parameters[0], localC, localT[0]); })
-        .Case("rz",
-              [&]() { simulator()->rz(parameters[0], localC, localT[0]); })
-        .Case("s", [&]() { simulator()->s(localC, localT[0]); })
-        .Case("t", [&]() { simulator()->t(localC, localT[0]); })
-        .Case("sdg", [&]() { simulator()->sdg(localC, localT[0]); })
-        .Case("tdg", [&]() { simulator()->tdg(localC, localT[0]); })
-        .Case("r1",
-              [&]() { simulator()->r1(parameters[0], localC, localT[0]); })
-        .Case("u1",
-              [&]() { simulator()->u1(parameters[0], localC, localT[0]); })
-        .Case("u3",
-              [&]() {
-                simulator()->u3(parameters[0], parameters[1], parameters[2],
-                                localC, localT[0]);
-              })
-        .Case("swap",
-              [&]() { simulator()->swap(localC, localT[0], localT[1]); })
-        .Case("exp_pauli",
-              [&]() {
-                simulator()->applyExpPauli(parameters[0], localC, localT, op);
-              })
-        .Default([&]() {
-          throw std::runtime_error("[DefaultExecutionManager] invalid gate "
-                                   "application requested " +
-                                   gateName + ".");
-        })();
-  }
-
-  int measureQudit(const cudaq::QuditInfo &q,
-                   const std::string &registerName) override {
-    flushRequestedAllocations();
-    return simulator()->mz(q.id, registerName);
-  }
-
-  void measureSpinOp(const cudaq::spin_op &op) override {
-    flushRequestedAllocations();
-    simulator()->flushGateQueue();
-
-    if (executionContext->canHandleObserve) {
-      auto result = simulator()->observe(*executionContext->spin.value());
-      executionContext->expectationValue = result.expectationValue;
-      executionContext->result = cudaq::sample_result(result);
-      return;
-    }
-
-    assert(op.num_terms() == 1 && "Number of terms is not 1.");
-
-    cudaq::info("Measure {}", op.to_string(false));
-    std::vector<std::size_t> qubitsToMeasure;
-    std::vector<std::function<void(bool)>> basisChange;
-    op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) {
-      if (type != cudaq::pauli::I)
-        qubitsToMeasure.push_back(qubitIdx);
-
-      if (type == cudaq::pauli::Y)
-        basisChange.emplace_back([&, qubitIdx](bool reverse) {
-          simulator()->rx(!reverse ? M_PI_2 : -M_PI_2, qubitIdx);
-        });
-      else if (type == cudaq::pauli::X)
-        basisChange.emplace_back(
-            [&, qubitIdx](bool) { simulator()->h(qubitIdx); });
-    });
-
-    // Change basis, flush the queue
-    if (!basisChange.empty()) {
-      for (auto &basis : basisChange)
-        basis(false);
-
-      simulator()->flushGateQueue();
-    }
-
-    // Get whether this is shots-based
-    int shots = 0;
-    if (executionContext->shots > 0)
-      shots = executionContext->shots;
-
-    // Sample and give the data to the context
-    cudaq::ExecutionResult result = simulator()->sample(qubitsToMeasure, shots);
-    executionContext->expectationValue = result.expectationValue;
-    executionContext->result = cudaq::sample_result(result);
-
-    // Restore the state.
-    if (!basisChange.empty()) {
-      std::reverse(basisChange.begin(), basisChange.end());
-      for (auto &basis : basisChange)
-        basis(true);
-
-      simulator()->flushGateQueue();
-    }
-  }
-
-public:
-  MpsExecutionManager() {
-    cudaq::info("[MpsExecutionManager] Creating the {} backend.",
-                simulator()->name());
-  }
-  virtual ~MpsExecutionManager() = default;
-
-  void resetQudit(const cudaq::QuditInfo &q) override {
-    flushRequestedAllocations();
-    simulator()->resetQubit(q.id);
-  }
-
-  void endAdjointRegion() override {
-    assert(!adjointQueueStack.empty() && "There must be at least one queue");
-
-    auto adjointQueue = std::move(adjointQueueStack.back());
-    adjointQueueStack.pop_back();
-
-    // Select the queue to which these instructions will be added.
-    InstructionQueue *queue = adjointQueueStack.empty()
-                                  ? &instructionQueue
-                                  : &(adjointQueueStack.back());
-
-    std::reverse(adjointQueue.begin(), adjointQueue.end());
-    for (auto &instruction : adjointQueue) {
-      const auto insts = decomposeMultiControlledInstruction(
-          instruction, auxQuditIdsForDeletion);
-      queue->insert(queue->end(), insts.begin(), insts.end());
-    }
-  }
-
-  /// The goal for apply is to create a new element of the
-  /// instruction queue (a tuple).
-  void apply(const std::string_view gateName, const std::vector<double> &params,
-             const std::vector<cudaq::QuditInfo> &controls,
-             const std::vector<cudaq::QuditInfo> &targets,
-             bool isAdjoint, cudaq::spin_op op) override {
-
-    // Make a copy of the name that we can mutate if necessary
-    std::string mutable_name(gateName);
-
-    // Make a copy of the parameters that we can mutate
-    std::vector<double> mutable_params = params;
-
-    // Create an array of controls, we will
-    // prepend any extra controls if in a control region
-    std::vector<cudaq::QuditInfo> mutable_controls;
-    for (auto &e : extraControlIds)
-      mutable_controls.emplace_back(2, e);
-
-    for (auto &e : controls)
-      mutable_controls.push_back(e);
-
-    std::vector<cudaq::QuditInfo> mutable_targets;
-    for (auto &t : targets)
-      mutable_targets.push_back(t);
-    // We need to check if we need take the adjoint of the operation. To do this
-    // we use a logical XOR between `isAdjoint` and whether the size of
-    // `adjointQueueStack` is even. The size of `adjointQueueStack` corresponds
-    // to the number of nested `cudaq::adjoint` calls. If the size is even, then
-    // we need to change the operation when `isAdjoint` is true. If the size is
-    // odd, then we need to change the operation when `isAdjoint` is false.
-    // (Adjoint modifiers cancel each other, e.g, `adj adj r1` is `r1`.)
-    //
-    // The cases:
-    //  * not-adjoint, even number of `cudaq::adjoint` => _no_ need to change op
-    //  * not-adjoint, odd number of `cudaq::adjoint`  => change op
-    //  * adjoint,     even number of `cudaq::adjoint` => change op
-    //  * adjoint,     odd number `cudaq::adjoint`     => _no_ need to change op
-    //
-    bool evenAdjointStack = (adjointQueueStack.size() % 2) == 0;
-    if (isAdjoint != !evenAdjointStack) {
-      for (std::size_t i = 0; i < params.size(); i++)
-        mutable_params[i] = -1.0 * params[i];
-      if (gateName == "t")
-        mutable_name = "tdg";
-      else if (gateName == "s")
-        mutable_name = "sdg";
-    }
-
-    if (!adjointQueueStack.empty()) {
-      // Add to the adjoint instruction queue
-      adjointQueueStack.back().emplace_back(
-          mutable_name, mutable_params, mutable_controls, mutable_targets, op);
-      return;
-    }
-
-    const auto insts = decomposeMultiControlledInstruction(
-        {std::move(mutable_name), mutable_params, mutable_controls,
-         mutable_targets, op},
-        auxQuditIdsForDeletion);
-    instructionQueue.insert(instructionQueue.end(), insts.begin(), insts.end());
-  }
-
-  void resetExecutionContext() override {
-    BasicExecutionManager::resetExecutionContext();
-
-    deallocateQudits(auxQuditIdsForDeletion);
-    for (auto &q : auxQuditIdsForDeletion) {
-      returnIndex(q.id);
-    }
-    auxQuditIdsForDeletion.clear();
-  }
-};
-
-} // namespace
-
-CUDAQ_REGISTER_EXECUTION_MANAGER(MpsExecutionManager)
\ No newline at end of file
diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
index b849b7a119..ad53cdfb3c 100644
--- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp
+++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -18,7 +18,7 @@ class SimulatorMPS : public SimulatorTensorNetBase {
   // Default relative cutoff
   double m_relCutoff = 1e-5;
   std::vector<void *> m_mpsTensors_d;
-
+  std::vector<std::size_t> m_auxQubitsForGateDecomp;
 public:
   SimulatorMPS() : SimulatorTensorNetBase() {
     if (auto *maxBondEnvVar = std::getenv("CUDAQ_MPS_MAX_BOND")) {
@@ -108,6 +108,223 @@ class SimulatorMPS : public SimulatorTensorNetBase {
     }
     m_mpsTensors_d.clear();
   }
+
+  void resetExecutionContext() override {
+    SimulatorTensorNetBase::resetExecutionContext();
+    m_auxQubitsForGateDecomp.clear();
+  }
+
+  /// @brief Return the state vector data (auxiliary qubits projected to 0)
+  cudaq::State getStateData() override {
+    LOG_API_TIME();
+    // Handle empty state (e.g., no qubit allocation) before dereferencing it
+    if (!m_state)
+      return cudaq::State{{0}, {}};
+    if (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size() >= 64)
+      throw std::runtime_error("State vector data is too large.");
+    const uint64_t svDim =
+        (1ull << (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size()));
+    const std::vector<int32_t> projectedModes(m_auxQubitsForGateDecomp.begin(),
+                                              m_auxQubitsForGateDecomp.end());
+    return cudaq::State{{svDim}, m_state->getStateVector(projectedModes)};
+  }
+
+  size_t addAuxQubit() {
+    if (m_state->isDirty())
+      throw std::runtime_error(
+          "[MPS Simulator] Unable to perform multi-control gate decomposition "
+          "due to dynamical circuits.");
+    m_state = std::make_unique<TensorNetState>(m_state->getNumQubits() + 1,
+                                               m_cutnHandle);
+    return m_state->getNumQubits() - 1;
+  }
+
+  template <typename QuantumOperation>
+  void
+  decomposeMultiControlledInstruction(const std::vector<double> &params,
+                                      const std::vector<std::size_t> &controls,
+                                      const std::vector<std::size_t> &targets) {
+    if (controls.size() <= 1) {
+      enqueueQuantumOperation<QuantumOperation>(params, controls, targets);
+      return;
+    }
+
+    const auto ccnot = [&](std::size_t a, std::size_t b, std::size_t c) {
+      enqueueQuantumOperation<nvqir::h<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {b}, {c});
+      enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {c});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {b}, {c});
+      enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {c});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {b});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::h<double>>({}, {}, {c});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {b});
+      enqueueQuantumOperation<nvqir::t<double>>({}, {}, {a});
+      enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {b});
+      enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {b});
+    };
+
+    const auto collectControls = [&](const std::vector<std::size_t> &ctls,
+                                     const std::vector<std::size_t> &aux,
+                                     int adjustment) {
+      for (int i = 0; i < static_cast<int>(ctls.size()) - 1; i += 2) {
+        ccnot(ctls[i], ctls[i + 1], aux[i / 2]);
+      }
+      for (int i = 0; i < static_cast<int>(ctls.size()) / 2 - 1 - adjustment;
+           ++i) {
+        ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
+      }
+    };
+    const auto adjustForSingleControl =
+        [&](const std::vector<std::size_t> &ctls,
+            const std::vector<std::size_t> &aux) {
+          if (ctls.size() % 2 != 0)
+            ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3],
+                  aux[ctls.size() - 2]);
+        };
+
+    std::vector<std::size_t> aux;
+    for (std::size_t i = 0; i < controls.size() - 1; ++i) {
+      const auto auxQubit = addAuxQubit();
+      m_auxQubitsForGateDecomp.emplace_back(auxQubit);
+      aux.emplace_back(auxQubit);
+    }
+
+    collectControls(controls, aux, 0);
+    adjustForSingleControl(controls, aux);
+    
+    // Enqueue the operation, now controlled only by the last aux qubit
+    enqueueQuantumOperation<QuantumOperation>(params, {aux[controls.size() - 2]}, targets);
+   
+    adjustForSingleControl(controls, aux);
+    collectControls(controls, aux, 0);
+  }
+
+#define CIRCUIT_SIMULATOR_ONE_QUBIT(NAME)                                      \
+  using CircuitSimulator::NAME;                                                \
+  void NAME(const std::vector<std::size_t> &controls,                          \
+            const std::size_t qubitIdx) override {                             \
+    decomposeMultiControlledInstruction<nvqir::NAME<double>>(                  \
+        {}, controls, std::vector<std::size_t>{qubitIdx});                     \
+  }
+
+#define CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(NAME)                            \
+  using CircuitSimulator::NAME;                                                \
+  void NAME(const double angle, const std::vector<std::size_t> &controls,      \
+            const std::size_t qubitIdx) override {                             \
+    decomposeMultiControlledInstruction<nvqir::NAME<double>>(                  \
+        {angle}, controls, std::vector<std::size_t>{qubitIdx});                \
+  }
+
+  /// @brief The X gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(x)
+  /// @brief The Y gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(y)
+  /// @brief The Z gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(z)
+  /// @brief The H gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(h)
+  /// @brief The S gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(s)
+  /// @brief The T gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(t)
+  /// @brief The Sdg gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(sdg)
+  /// @brief The Tdg gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT(tdg)
+  /// @brief The RX gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rx)
+  /// @brief The RY gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(ry)
+  /// @brief The RZ gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rz)
+  /// @brief The Phase gate
+  CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(r1)
+// Undef those preprocessor defines.
+#undef CIRCUIT_SIMULATOR_ONE_QUBIT
+#undef CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM
+
+  using CircuitSimulator::swap;
+  void swap(const std::vector<std::size_t> &ctrlBits, const std::size_t srcIdx,
+            const std::size_t tgtIdx) override {
+    if (ctrlBits.empty())
+      return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx);
+    {
+      std::vector<std::size_t> ctls = ctrlBits;
+      ctls.emplace_back(tgtIdx);
+      decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
+    }
+    {
+      std::vector<std::size_t> ctls = ctrlBits;
+      ctls.emplace_back(srcIdx);
+      decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {tgtIdx});
+    }
+    {
+      std::vector<std::size_t> ctls = ctrlBits;
+      ctls.emplace_back(tgtIdx);
+      decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
+    }
+  }
+ 
+  void applyExpPauli(double theta,
+                             const std::vector<std::size_t> &controls,
+                             const std::vector<std::size_t> &qubitIds,
+                             const cudaq::spin_op &op) override {
+    if (op.is_identity()) {
+      if (controls.empty()) {
+        // exp(i*theta*Id) is noop if this is not a controlled gate.
+        return;
+      } else {
+        // Throw an error if this exp_pauli(i*theta*Id) becomes a non-trivial
+        // gate due to control qubits.
+        // FIXME: revisit this once
+        // https://github.com/NVIDIA/cuda-quantum/issues/483 is implemented.
+        throw std::logic_error("Applying controlled global phase via exp_pauli "
+                               "of identity operator is not supported");
+      }
+    }
+    std::vector<std::size_t> qubitSupport;
+    std::vector<std::function<void(bool)>> basisChange;
+    op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) {
+      if (type != cudaq::pauli::I)
+        qubitSupport.push_back(qubitIds[qubitIdx]);
+
+      if (type == cudaq::pauli::Y)
+        basisChange.emplace_back([&, qubitIdx](bool reverse) {
+          rx(!reverse ? M_PI_2 : -M_PI_2, qubitIds[qubitIdx]);
+        });
+      else if (type == cudaq::pauli::X)
+        basisChange.emplace_back(
+            [&, qubitIdx](bool) { h(qubitIds[qubitIdx]); });
+    });
+
+    if (!basisChange.empty())
+      for (auto &basis : basisChange)
+        basis(false);
+
+    std::vector<std::pair<std::size_t, std::size_t>> toReverse;
+    for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) {
+      x({qubitSupport[i]}, qubitSupport[i + 1]);
+      toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]);
+    }
+
+    // Perform multi-control decomposition.
+    decomposeMultiControlledInstruction<nvqir::rz<double>>(
+        {-2.0 * theta}, controls, {qubitSupport.back()});
+
+    std::reverse(toReverse.begin(), toReverse.end());
+    for (auto &[i, j] : toReverse)
+      x({i}, j);
+
+    if (!basisChange.empty()) {
+      std::reverse(basisChange.begin(), basisChange.end());
+      for (auto &basis : basisChange)
+        basis(true);
+    }
+  }
 };
 } // end namespace nvqir
 
diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config
index 040ae0d8bf..27ac2279b2 100644
--- a/runtime/nvqir/cutensornet/tensornet-mps.config
+++ b/runtime/nvqir/cutensornet/tensornet-mps.config
@@ -9,5 +9,3 @@
 NVQIR_SIMULATION_BACKEND="tensornet-mps"
 TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target"
 GPU_REQUIREMENTS="true"
-COMPILER_FLAGS="$COMPILER_FLAGS -DCUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION"
-LIBRARY_MODE_EXECUTION_MANAGER="mps"
\ No newline at end of file
diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
index d869801a46..ecf455e819 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.cpp
+++ b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -23,19 +23,17 @@ TensorNetState::TensorNetState(std::size_t numQubits,
 void TensorNetState::applyGate(const std::vector<int32_t> &qubitIds,
                                void *gateDeviceMem, bool adjoint) {
 
-  int64_t id = 0;
   HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(
       m_cutnHandle, m_quantumState, qubitIds.size(), qubitIds.data(),
       gateDeviceMem, nullptr, /*immutable*/ 1,
-      /*adjoint*/ static_cast<int32_t>(adjoint), /*unitary*/ 1, &id));
+      /*adjoint*/ static_cast<int32_t>(adjoint), /*unitary*/ 1, &m_tensorId));
 }
 
 void TensorNetState::applyQubitProjector(void *proj_d, int32_t qubitIdx) {
-  int64_t id = 0;
   HANDLE_CUTN_ERROR(
       cutensornetStateApplyTensor(m_cutnHandle, m_quantumState, 1, &qubitIdx,
                                   proj_d, nullptr, /*immutable*/ 1,
-                                  /*adjoint*/ 0, /*unitary*/ 0, &id));
+                                  /*adjoint*/ 0, /*unitary*/ 0, &m_tensorId));
 }
 
 std::unordered_map<std::string, size_t>
@@ -120,24 +118,26 @@ TensorNetState::sample(const std::vector<int32_t> &measuredBitIds,
   return counts;
 }
 
-std::vector<std::complex<double>> TensorNetState::getStateVector() {
+std::vector<std::complex<double>>
+TensorNetState::getStateVector(const std::vector<int32_t> &projectedModes) {
   // Make sure that we don't overflow the memory size calculation.
   // Note: the actual limitation will depend on the system memory.
-  if (m_numQubits > 64 ||
-      (1ull << m_numQubits) >
+  if ((m_numQubits - projectedModes.size()) > 64 ||
+      (1ull << (m_numQubits - projectedModes.size())) >
           std::numeric_limits<uint64_t>::max() / sizeof(std::complex<double>))
     throw std::runtime_error(
         "Too many qubits are requested for full state vector contraction.");
   LOG_API_TIME();
   void *d_sv{nullptr};
-  const uint64_t svDim = 1ull << m_numQubits;
+  const uint64_t svDim = 1ull << (m_numQubits - projectedModes.size());
   HANDLE_CUDA_ERROR(cudaMalloc(&d_sv, svDim * sizeof(std::complex<double>)));
   ScratchDeviceMem scratchPad;
 
   // Create the quantum state amplitudes accessor
   cutensornetStateAccessor_t accessor;
-  HANDLE_CUTN_ERROR(cutensornetCreateAccessor(m_cutnHandle, m_quantumState, 0,
-                                              nullptr, nullptr, &accessor));
+  HANDLE_CUTN_ERROR(cutensornetCreateAccessor(
+      m_cutnHandle, m_quantumState, projectedModes.size(),
+      projectedModes.data(), nullptr, &accessor));
 
   const int32_t numHyperSamples =
       8; // desired number of hyper samples used in the tensor network
@@ -167,9 +167,11 @@ std::vector<std::complex<double>> TensorNetState::getStateVector() {
 
   // Compute the quantum state amplitudes
   std::complex<double> stateNorm{0.0, 0.0};
-  HANDLE_CUTN_ERROR(
-      cutensornetAccessorCompute(m_cutnHandle, accessor, nullptr, workDesc,
-                                 d_sv, static_cast<void *>(&stateNorm), 0));
+  // All projected modes are assumed to be projected to 0.
+  std::vector<int64_t> projectedModeValues(projectedModes.size(), 0);
+  HANDLE_CUTN_ERROR(cutensornetAccessorCompute(
+      m_cutnHandle, accessor, projectedModeValues.data(), workDesc, d_sv,
+      static_cast<void *>(&stateNorm), 0));
   std::vector<std::complex<double>> h_sv(svDim);
   HANDLE_CUDA_ERROR(cudaMemcpy(h_sv.data(), d_sv,
                                svDim * sizeof(std::complex<double>),
diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
index bd38659d1c..7416b3807b 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.h
+++ b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -19,6 +19,7 @@ class TensorNetState {
   std::size_t m_numQubits;
   cutensornetHandle_t m_cutnHandle;
   cutensornetState_t m_quantumState;
+  int64_t m_tensorId = -1;
 
 public:
   /// @brief Constructor
@@ -45,7 +46,8 @@ class TensorNetState {
 
   /// @brief Contract the tensor network representation to retrieve the state
   /// vector.
-  std::vector<std::complex<double>> getStateVector();
+  std::vector<std::complex<double>>
+  getStateVector(const std::vector<int32_t> &projectedModes = {});
 
   /// @brief Compute the reduce density matrix on a set of qubits
   ///
@@ -73,7 +75,10 @@ class TensorNetState {
 
   /// @brief Number of qubits that this state represents.
   std::size_t getNumQubits() const { return m_numQubits; }
-
+  
+  /// @brief True if the state contains gate tensors (not just initial qubit
+  /// tensors)
+  bool isDirty() const { return m_tensorId > 0; }
   /// @brief Destructor
   ~TensorNetState();
 };
diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp
index 7d27635ef8..abdac9b870 100644
--- a/unittests/integration/builder_tester.cpp
+++ b/unittests/integration/builder_tester.cpp
@@ -125,7 +125,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) {
     EXPECT_EQ(counter, 1000);
   }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
   // MPS doesn't support gates on more than 2 qubits
   {
     auto ccnot_builder = cudaq::make_kernel();
@@ -139,7 +138,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) {
     counts.dump();
     EXPECT_TRUE(counts.begin()->first == "101");
   }
-#endif
 
   {
     // Check controlled parametric gates (constant angle)
@@ -221,8 +219,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) {
   }
 }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
 CUDAQ_TEST(BuilderTester, checkRotations) {
 
   // rx: entire qvector
@@ -427,11 +423,7 @@ CUDAQ_TEST(BuilderTester, checkRotations) {
     EXPECT_EQ(counts.count("0111"), 1000);
   }
 }
-#endif
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// Skip, else fails with error - '"MPS simulator: Gates on 3 or more qubits are
-// unsupported. Encountered: swap[0][1,2]" thrown in the test body.'
 CUDAQ_TEST(BuilderTester, checkSwap) {
   cudaq::set_random_seed(13);
 
@@ -575,7 +567,6 @@ CUDAQ_TEST(BuilderTester, checkSwap) {
     EXPECT_NEAR(counts.count(want_state), 1000, 0);
   }
 }
-#endif
 
 // Conditional execution on the tensornet backend is slow for a large number of
 // shots.
@@ -708,8 +699,6 @@ CUDAQ_TEST(BuilderTester, checkIsArgStdVec) {
   EXPECT_FALSE(kernel.isArgStdVec(1));
 }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
 CUDAQ_TEST(BuilderTester, checkKernelControl) {
   cudaq::set_random_seed(13);
 
@@ -766,7 +755,6 @@ CUDAQ_TEST(BuilderTester, checkKernelControl) {
   EXPECT_EQ(1, counts.size());
   EXPECT_TRUE(counts.begin()->first == "101");
 }
-#endif
 
 CUDAQ_TEST(BuilderTester, checkAdjointOp) {
   auto kernel = cudaq::make_kernel();
@@ -1101,8 +1089,6 @@ CUDAQ_TEST(BuilderTester, checkExpPauli) {
   }
 }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits
 CUDAQ_TEST(BuilderTester, checkControlledRotations) {
   // rx: pi
   {
@@ -1217,7 +1203,6 @@ CUDAQ_TEST(BuilderTester, checkControlledRotations) {
     EXPECT_EQ(counts.count("11111111"), 1000);
   }
 }
-#endif
 
 #ifndef CUDAQ_BACKEND_DM
 

From a9a4fcf012bb440368420368e4e899f6cb66dc3d Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 00:44:14 +0000
Subject: [PATCH 08/18] Remove the remaining bypassed tests

---
 unittests/CMakeLists.txt      | 2 +-
 unittests/qir/NVQIRTester.cpp | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index b9a01b26d1..50b976f337 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -80,7 +80,7 @@ macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "tensornet-mps")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_BACKEND_TENSORNET_MPS)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "custatevec-fp32")
diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp
index f848786f2e..f287877573 100644
--- a/unittests/qir/NVQIRTester.cpp
+++ b/unittests/qir/NVQIRTester.cpp
@@ -113,8 +113,6 @@ CUDAQ_TEST(NVQIRTester, checkSimple) {
   __quantum__rt__finalize();
 }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits (controlled swap)
 CUDAQ_TEST(NVQIRTester, checkQuantumIntrinsics) {
   __quantum__rt__initialize(0, nullptr);
   auto qubits = __quantum__rt__qubit_allocate_array(3);
@@ -153,7 +151,6 @@ CUDAQ_TEST(NVQIRTester, checkQuantumIntrinsics) {
   __quantum__rt__qubit_release_array(qubits);
   __quantum__rt__finalize();
 }
-#endif
 
 CUDAQ_TEST(NVQIRTester, checkReset) {
   __quantum__rt__initialize(0, nullptr);
@@ -175,8 +172,6 @@ CUDAQ_TEST(NVQIRTester, checkReset) {
   __quantum__rt__finalize();
 }
 
-#ifndef CUDAQ_BACKEND_TENSORNET_MPS
-// MPS doesn't support gates on more than 2 qubits (controlled swap)
 // SWAP with a single ctrl qubit in 0 state.
 CUDAQ_TEST(NVQIRTester, checkSWAP) {
   // Simple SWAP.
@@ -261,7 +256,6 @@ CUDAQ_TEST(NVQIRTester, checkSWAP) {
     __quantum__rt__finalize();
   }
 }
-#endif
 
 CUDAQ_TEST(NVQIRTester, checkQubitReset) {
   // Initialize two qubits in the 0-state.

From ea7b38901177bf2cada5ea3470e5f6fa08f861aa Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 00:49:45 +0000
Subject: [PATCH 09/18] Clean up debug changes

---
 multi_ctrls.cpp                               |  96 --------
 .../kernels/decomposition/controlled_gates.h  | 218 ------------------
 runtime/cudaq/utils/cudaq_utils.h             |   7 -
 runtime/nvqir/cutensornet/CMakeLists.txt      |   4 +-
 .../nvqir/cutensornet/tensornet-mps.config    |  11 -
 runtime/nvqir/cutensornet/tensornet.config    |  11 -
 unittests/integration/gate_library_tester.cpp |   1 -
 7 files changed, 2 insertions(+), 346 deletions(-)
 delete mode 100644 multi_ctrls.cpp
 delete mode 100644 runtime/cudaq/kernels/decomposition/controlled_gates.h
 delete mode 100644 runtime/nvqir/cutensornet/tensornet-mps.config
 delete mode 100644 runtime/nvqir/cutensornet/tensornet.config

diff --git a/multi_ctrls.cpp b/multi_ctrls.cpp
deleted file mode 100644
index 3e83558ece..0000000000
--- a/multi_ctrls.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-#include <cudaq.h>
-namespace cudaq {
-
-void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ {
-  h(c);
-  cx(b, c);
-  t<cudaq::adj>(c);
-  cx(a, c);
-  t(c);
-  cx(b, c);
-  t<cudaq::adj>(c);
-  cx(a, c);
-  t(b);
-  t(c);
-  h(c);
-  cx(a, b);
-  t(a);
-  t<cudaq::adj>(b);
-  cx(a, b);
-}
-
-void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux,
-                     int adjustment) __qpu__ {
-  for (int i = 0; i < ctls.size() - 1; i += 2) {
-    CCNOT(ctls[i], ctls[i + 1], aux[i / 2]);
-  }
-  for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) {
-    CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
-  }
-}
-
-void CollectControls(
-    const std::vector<std::reference_wrapper<cudaq::qubit>> &ctls,
-    cudaq::qview<> aux, int adjustment) __qpu__ {
-  for (int i = 0; i < ctls.size() - 1; i += 2) {
-    CCNOT(ctls[i], ctls[i + 1], aux[i / 2]);
-  }
-  for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) {
-    CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
-  }
-}
-
-void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ {
-  if (ctls.size() % 2 != 0)
-    CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]);
-}
-
-template <size_t V, typename... T>
-decltype(auto) getParameterPackVals(T &&...Args) noexcept {
-  return std::get<V>(std::forward_as_tuple(std::forward<T>(Args)...));
-}
-
-template <typename mod, typename... QubitTy>
-void x(cudaq::qubit& c0, cudaq::qubit& c1, QubitTy &...qubits) __qpu__ {
-  static_assert(std::is_same_v<mod, cudaq::ctrl>);
-  static constexpr std::size_t qubitCount = sizeof...(qubits) + 2;
-  static constexpr std::size_t numCtrls = qubitCount - 1;
-  static_assert(numCtrls > 1);
-  if constexpr (numCtrls == 2) {
-    CCNOT(c0,
-          c1,
-          getParameterPackVals<0>(qubits...));
-  } else {
-    cudaq::qvector aux(numCtrls - 2);
-    std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};
-    ctls.pop_back();
-    ctls.emplace_back(c1);
-    ctls.emplace_back(c0);
-    assert(ctls.size() == numCtrls);
-    cudaq::compute_action(
-        [&]() { CollectControls(ctls, aux, 1 - (ctls.size() % 2)); },
-        [&]() {
-          if (ctls.size() % 2 != 0) {
-            CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
-          } else {
-            CCNOT(aux[ctls.size() - 3], aux[ctls.size() - 4], getParameterPackVals<sizeof...(qubits) - 1>(qubits...));
-          }
-        });
-  }
-}
-} // namespace cudaq
-
-int main() {
-
-  auto kernel = []() __qpu__ {
-    cudaq::qarray<5> q;
-    x(q);
-    x<cudaq::ctrl>(q[0], q[1], q[2], q[3], q[4]);
-    mz(q);
-  };
-
-  auto counts = cudaq::sample(kernel);
-  counts.dump();
-
-  return 0;
-}
\ No newline at end of file
diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h
deleted file mode 100644
index 254eb0822a..0000000000
--- a/runtime/cudaq/kernels/decomposition/controlled_gates.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#pragma once
-
-#include <cudaq.h>
-#include <cudaq/utils/cudaq_utils.h>
-
-namespace cudaq {
-namespace internal {
-
-/// Applies the doubly controlled–NOT (CCNOT) gate to three qubits via
-/// decomposition.
-void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ {
-  h(c);
-  cx(b, c);
-  t<cudaq::adj>(c);
-  cx(a, c);
-  t(c);
-  cx(b, c);
-  t<cudaq::adj>(c);
-  cx(a, c);
-  t(b);
-  t(c);
-  h(c);
-  cx(a, b);
-  t(a);
-  t<cudaq::adj>(b);
-  cx(a, b);
-}
-
-/// Collects the given list of control qubits into one or two of the given
-/// auxiliary qubits, using all but the last qubits in the auxiliary list as
-/// scratch qubits.
-/// For example, if the controls list is 6 qubits, the auxiliary list must be 5
-/// qubits, and the state from the 6 control qubits will be collected into the
-/// last qubit of the auxiliary array.
-/// The adjustment is used to allow the caller to reduce or increase
-/// the number of times this is run based on the eventual number of control
-/// qubits needed.
-void CollectControls(
-    const std::vector<std::reference_wrapper<cudaq::qubit>> &ctls,
-    cudaq::qview<> aux, int adjustment) __qpu__ {
-  for (int i = 0; i < ctls.size() - 1; i += 2) {
-    CCNOT(ctls[i], ctls[i + 1], aux[i / 2]);
-  }
-  for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) {
-    CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
-  }
-}
-
-void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux,
-                     int adjustment) __qpu__ {
-  std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;
-  for (auto &q : ctls)
-    castedCtls.emplace_back(q);
-
-  CollectControls(castedCtls, aux, adjustment);
-}
-
-/// When collecting controls, if there is an uneven number of original control
-/// qubits then the last control and the second to last auxiliary will be
-/// collected into the last auxiliary.
-void AdjustForSingleControl(
-    const std::vector<std::reference_wrapper<cudaq::qubit>> &ctls,
-    cudaq::qview<> aux) __qpu__ {
-  if (ctls.size() % 2 != 0)
-    CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]);
-}
-
-template <typename mod>
-void x(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,
-       cudaq::qubit &target) __qpu__ {
-  static_assert(std::is_same_v<mod, cudaq::ctrl>);
-  const std::size_t numCtrls = ctrls.size();
-  if (numCtrls == 0) {
-    x(target);
-  } else if (numCtrls == 1) {
-    cx(ctrls[0].get(), target);
-  } else if (numCtrls == 2) {
-    CCNOT(ctrls[0], ctrls[1], target);
-  } else {
-    cudaq::qvector aux(numCtrls - 2);
-    cudaq::compute_action(
-        [&]() { CollectControls(ctrls, aux, 1 - (ctrls.size() % 2)); },
-        [&]() {
-          if (ctrls.size() % 2 != 0) {
-            CCNOT(ctrls[ctrls.size() - 1], aux[ctrls.size() - 3], target);
-          } else {
-            CCNOT(aux[ctrls.size() - 3], aux[ctrls.size() - 4], target);
-          }
-        });
-  }
-}
-
-#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(NAME)                        \
-  template <typename mod>                                                      \
-  void NAME(const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,    \
-            cudaq::qubit &target) __qpu__ {                                    \
-    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
-    const std::size_t numCtrls = ctrls.size();                                 \
-    if (numCtrls == 0) {                                                       \
-      NAME(target);                                                            \
-    } else if (numCtrls == 1) {                                                \
-      NAME<cudaq::ctrl>(ctrls[0].get(), target);                               \
-    } else {                                                                   \
-      cudaq::qvector aux(numCtrls - 1);                                        \
-      cudaq::compute_action(                                                   \
-          [&]() {                                                              \
-            CollectControls(ctrls, aux, 0);                                    \
-            AdjustForSingleControl(ctrls, aux);                                \
-          },                                                                   \
-          [&]() { NAME<cudaq::ctrl>(aux[ctrls.size() - 2], target); });        \
-    }                                                                          \
-  }
-
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(h)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(y)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(z)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(t)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(s)
-
-#define CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(NAME)              \
-  template <typename mod, typename ScalarAngle>                                \
-  void NAME(ScalarAngle angle,                                                 \
-            const std::vector<std::reference_wrapper<cudaq::qubit>> &ctrls,    \
-            cudaq::qubit &target) __qpu__ {                                    \
-    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
-    const std::size_t numCtrls = ctrls.size();                                 \
-    if (numCtrls == 0) {                                                       \
-      NAME(angle, target);                                                     \
-    } else if (numCtrls == 1) {                                                \
-      oneQubitSingleParameterApply<qubit_op::NAME##Op, mod>(                   \
-          angle, ctrls[0].get(), target);                                      \
-    } else {                                                                   \
-      cudaq::qvector aux(numCtrls - 1);                                        \
-      cudaq::compute_action(                                                   \
-          [&]() {                                                              \
-            CollectControls(ctrls, aux, 0);                                    \
-            AdjustForSingleControl(ctrls, aux);                                \
-          },                                                                   \
-          [&]() {                                                              \
-            oneQubitSingleParameterApply<qubit_op::NAME##Op, mod>(             \
-                angle, aux[ctrls.size() - 2], target);                         \
-          });                                                                  \
-    }                                                                          \
-  }
-
-CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rx)
-CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(ry)
-CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rz)
-CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(r1)
-} // namespace internal
-
-#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(NAME)                             \
-  template <typename mod, typename... QubitTy>                                 \
-  void NAME(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ {  \
-    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
-    std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};       \
-    /* Last qubit is the target */                                             \
-    ctls.pop_back();                                                           \
-    /*Add the two explicit qubits */                                           \
-    ctls.emplace_back(c1);                                                     \
-    ctls.emplace_back(c0);                                                     \
-    internal::NAME<mod>(                                                       \
-        ctls, cudaq::getParameterPackVals<sizeof...(qubits) - 1>(qubits...));  \
-  }                                                                            \
-  template <typename mod>                                                      \
-  void NAME(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ {              \
-    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
-    std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;              \
-    for (auto &q : ctrls)                                                      \
-      castedCtls.emplace_back(q);                                              \
-    internal::NAME<mod>(castedCtls, target);                                   \
-  }
-
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(h)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(x)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(y)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(z)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(t)
-CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(s)
-
-#define CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(NAME)                   \
-  template <typename mod, typename ScalarAngle, typename... QubitTy>           \
-  void NAME(ScalarAngle angle, cudaq::qubit &c0, cudaq::qubit &c1,             \
-            QubitTy &...qubits) __qpu__ {                                      \
-    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
-    static_assert(sizeof...(qubits) > 0);                                      \
-    std::vector<std::reference_wrapper<cudaq::qubit>> ctls{{qubits...}};       \
-    /* Last qubit is the target */                                             \
-    cudaq::qubit &target = ctls.back();                                        \
-    ctls.pop_back();                                                           \
-    /*Add the two explicit qubits */                                           \
-    ctls.emplace_back(c1);                                                     \
-    ctls.emplace_back(c0);                                                     \
-    internal::NAME<mod>(angle, ctls, target);                                  \
-  }                                                                            \
-  template <typename mod, typename ScalarAngle>                                \
-  void NAME(ScalarAngle angle, cudaq::qview<> ctrls, cudaq::qubit &target)     \
-      __qpu__ {                                                                \
-    static_assert(std::is_same_v<mod, cudaq::ctrl>);                           \
-    std::vector<std::reference_wrapper<cudaq::qubit>> castedCtls;              \
-    for (auto &q : ctrls)                                                      \
-      castedCtls.emplace_back(q);                                              \
-    internal::NAME<mod>(angle, castedCtls, target);                            \
-  }
-
-CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rx)
-CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(ry)
-CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rz)
-CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(r1)
-} // namespace cudaq
diff --git a/runtime/cudaq/utils/cudaq_utils.h b/runtime/cudaq/utils/cudaq_utils.h
index 16d7d824d4..9e3280a232 100644
--- a/runtime/cudaq/utils/cudaq_utils.h
+++ b/runtime/cudaq/utils/cudaq_utils.h
@@ -188,13 +188,6 @@ void tuple_for_each_with_idx(TupleType &&t, FunctionType f) {
                           std::integral_constant<size_t, 0>());
 }
 
-// Utility function to access a parameter at index from a variadic parameter
-// pack.
-template <size_t Idx, typename... T>
-decltype(auto) getParameterPackVals(T &&...Args) noexcept {
-  return std::get<Idx>(std::forward_as_tuple(std::forward<T>(Args)...));
-}
-
 // Function check if file with given path+name exists
 inline bool fileExists(const std::string &name) {
   if (FILE *file = fopen(name.c_str(), "r")) {
diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt
index 22d8e30e8a..23f10564af 100644
--- a/runtime/nvqir/cutensornet/CMakeLists.txt
+++ b/runtime/nvqir/cutensornet/CMakeLists.txt
@@ -69,12 +69,12 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3")
     target_include_directories(nvqir-${LIBRARY_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/runtime/common ${CMAKE_SOURCE_DIR}/runtime/nvqir ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR})
     target_link_libraries(nvqir-${LIBRARY_NAME} PRIVATE fmt::fmt-header-only cudaq cudaq-common ${CUTENSORNET_LIB} ${CUTENSOR_LIB} CUDA::cudart)
     install(TARGETS nvqir-${LIBRARY_NAME} DESTINATION lib)
+    file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\n")
+    install(FILES ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config DESTINATION targets)
   endmacro()
 
   nvqir_create_cutn_plugin(tensornet ${BASE_TENSOR_BACKEND_SRS} simulator_tensornet_register.cpp )
   nvqir_create_cutn_plugin(tensornet-mps ${BASE_TENSOR_BACKEND_SRS} simulator_mps_register.cpp)
-  add_target_config(tensornet)
-  add_target_config(tensornet-mps)
   add_library(tensornet-mpi-util OBJECT mpi_support.cpp)
   target_include_directories(tensornet-mpi-util PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/runtime)
   target_link_libraries(tensornet-mpi-util PRIVATE cudaq-common fmt::fmt-header-only)
diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config
deleted file mode 100644
index 27ac2279b2..0000000000
--- a/runtime/nvqir/cutensornet/tensornet-mps.config
+++ /dev/null
@@ -1,11 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-NVQIR_SIMULATION_BACKEND="tensornet-mps"
-TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target"
-GPU_REQUIREMENTS="true"
diff --git a/runtime/nvqir/cutensornet/tensornet.config b/runtime/nvqir/cutensornet/tensornet.config
deleted file mode 100644
index 9f47ba1c88..0000000000
--- a/runtime/nvqir/cutensornet/tensornet.config
+++ /dev/null
@@ -1,11 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-NVQIR_SIMULATION_BACKEND="tensornet"
-TARGET_DESCRIPTION="cuTensorNet-based full tensor network contraction backend target"
-GPU_REQUIREMENTS="true"
diff --git a/unittests/integration/gate_library_tester.cpp b/unittests/integration/gate_library_tester.cpp
index 3d976199f5..af531e936c 100644
--- a/unittests/integration/gate_library_tester.cpp
+++ b/unittests/integration/gate_library_tester.cpp
@@ -88,7 +88,6 @@ CUDAQ_TEST(GateLibraryTester, checkGivensRotationKernelBuilder) {
   }
 }
 
-
 CUDAQ_TEST(GateLibraryTester, checkControlledGivensRotation) {
   for (const auto &angle : cudaq::linspace(-M_PI, M_PI, NUM_ANGLES)) {
     // Same check, with 2 control qubits

From 31f2b06e14ea287f7012d4d9f996b5cb50a18193 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 00:52:24 +0000
Subject: [PATCH 10/18] Code format

---
 .../cutensornet/simulator_mps_register.cpp      | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
index ad53cdfb3c..5605c5dbfa 100644
--- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp
+++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -19,6 +19,7 @@ class SimulatorMPS : public SimulatorTensorNetBase {
   double m_relCutoff = 1e-5;
   std::vector<void *> m_mpsTensors_d;
   std::vector<std::size_t> m_auxQubitsForGateDecomp;
+
 public:
   SimulatorMPS() : SimulatorTensorNetBase() {
     if (auto *maxBondEnvVar = std::getenv("CUDAQ_MPS_MAX_BOND")) {
@@ -195,10 +196,11 @@ class SimulatorMPS : public SimulatorTensorNetBase {
 
     collectControls(controls, aux, 0);
     adjustForSingleControl(controls, aux);
-    
+
     // Add to the singly-controlled instruction queue
-    enqueueQuantumOperation<QuantumOperation>(params, {aux[controls.size() - 2]}, targets);
-   
+    enqueueQuantumOperation<QuantumOperation>(
+        params, {aux[controls.size() - 2]}, targets);
+
     adjustForSingleControl(controls, aux);
     collectControls(controls, aux, 0);
   }
@@ -268,11 +270,10 @@ class SimulatorMPS : public SimulatorTensorNetBase {
       decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
     }
   }
- 
-  void applyExpPauli(double theta,
-                             const std::vector<std::size_t> &controls,
-                             const std::vector<std::size_t> &qubitIds,
-                             const cudaq::spin_op &op) override {
+
+  void applyExpPauli(double theta, const std::vector<std::size_t> &controls,
+                     const std::vector<std::size_t> &qubitIds,
+                     const cudaq::spin_op &op) override {
     if (op.is_identity()) {
       if (controls.empty()) {
         // exp(i*theta*Id) is noop if this is not a controlled gate.

From 2f61e494002fac666fff59ac6b2da54ea5abcbc2 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 01:44:07 +0000
Subject: [PATCH 11/18] Test time reduction

---
 unittests/integration/builder_tester.cpp  | 17 ++++++++---------
 unittests/integration/gradient_tester.cpp |  5 ++++-
 unittests/integration/nlopt_tester.cpp    |  6 +++++-
 unittests/integration/vqe_tester.cpp      |  5 ++++-
 unittests/qir/NVQIRTester.cpp             |  9 ++++++++-
 5 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp
index abdac9b870..0b7a554fb6 100644
--- a/unittests/integration/builder_tester.cpp
+++ b/unittests/integration/builder_tester.cpp
@@ -63,14 +63,14 @@ CUDAQ_TEST(BuilderTester, checkSimple) {
     };
     cudaq::gradients::central_difference gradient(ansatz, argMapper);
     cudaq::optimizers::lbfgs optimizer;
-    optimizer.initial_parameters = {0.35, 0.25};
-    optimizer.max_eval = 10;
-    optimizer.max_line_search_trials = 10;
+    optimizer.initial_parameters = {0.359, 0.257};
+    optimizer.max_eval = 4;
+    optimizer.max_line_search_trials = 8;
     auto [opt_val_0, optpp] =
         cudaq::vqe(ansatz, gradient, h3, optimizer, 2, argMapper);
     printf("Opt-params: %lf %lf \n", optpp[0], optpp[1]);
     printf("<H3> = %lf\n", opt_val_0);
-    EXPECT_NEAR(opt_val_0, -2.045375, 1e-3);
+    EXPECT_NEAR(opt_val_0, -2.045375, 1e-2);
   }
 
   {
@@ -97,12 +97,12 @@ CUDAQ_TEST(BuilderTester, checkSimple) {
 
     cudaq::gradients::central_difference gradient(ansatz);
     cudaq::optimizers::lbfgs optimizer;
-    optimizer.initial_parameters = {0.35, 0.25};
-    optimizer.max_eval = 10;
-    optimizer.max_line_search_trials = 10;
+    optimizer.initial_parameters = {0.359, 0.257};
+    optimizer.max_eval = 4;
+    optimizer.max_line_search_trials = 8;
     auto [opt_val_0, optpp] = cudaq::vqe(ansatz, gradient, h3, optimizer, 2);
     printf("<H3> = %lf\n", opt_val_0);
-    EXPECT_NEAR(opt_val_0, -2.045375, 1e-3);
+    EXPECT_NEAR(opt_val_0, -2.045375, 1e-2);
   }
 
   {
@@ -125,7 +125,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) {
     EXPECT_EQ(counter, 1000);
   }
 
-  // MPS doesn't support gates on more than 2 qubits
   {
     auto ccnot_builder = cudaq::make_kernel();
     auto q = ccnot_builder.qalloc(3);
diff --git a/unittests/integration/gradient_tester.cpp b/unittests/integration/gradient_tester.cpp
index 2211925c54..49e03ecf28 100644
--- a/unittests/integration/gradient_tester.cpp
+++ b/unittests/integration/gradient_tester.cpp
@@ -11,7 +11,10 @@
 #include <cudaq/algorithms/gradients/central_difference.h>
 #include <cudaq/optimizers.h>
 
-#ifndef CUDAQ_BACKEND_DM
+// Skip these gradient tests for slow backends to reduce test time.
+// Note: CUDA-Q API level tests (e.g., `cudaq::observe`) should cover all
+// backend-specific functionalities required to interface gradient modules.
+#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET
 struct deuteron_n3_ansatz {
   void operator()(double x0, double x1) __qpu__ {
     cudaq::qvector q(3);
diff --git a/unittests/integration/nlopt_tester.cpp b/unittests/integration/nlopt_tester.cpp
index ea745cad18..197d6ffc7e 100644
--- a/unittests/integration/nlopt_tester.cpp
+++ b/unittests/integration/nlopt_tester.cpp
@@ -13,7 +13,11 @@
 #include <cudaq/algorithms/gradients/central_difference.h>
 #include <cudaq/optimizers.h>
 
-#ifndef CUDAQ_BACKEND_DM
+// Skip these Nlopt optimizer tests for slow backends to reduce test time.
+// Note: CUDA-Q API level tests (e.g., `cudaq::observe`) should cover all
+// backend-specific functionalities required to interface with optimizers.
+#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET
+
 struct deuteron_n3_ansatz {
   void operator()(double x0, double x1) __qpu__ {
     cudaq::qvector q(3);
diff --git a/unittests/integration/vqe_tester.cpp b/unittests/integration/vqe_tester.cpp
index 6d18db99e9..5e4b5b9389 100644
--- a/unittests/integration/vqe_tester.cpp
+++ b/unittests/integration/vqe_tester.cpp
@@ -13,7 +13,10 @@
 #include <cudaq/algorithms/gradients/central_difference.h>
 #include <cudaq/optimizers.h>
 
-#ifndef CUDAQ_BACKEND_DM
+// Skip these VQE tests for slow backends to reduce test time.
+// Note: CUDA-Q API level tests (e.g., `cudaq::observe`) should cover all
+// backend-specific functionalities required for the `cudaq::vqe` wrapper.
+#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET
 
 struct ansatz_compute_action {
   void operator()(std::vector<double> theta) __qpu__ {
diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp
index f287877573..d8008e6590 100644
--- a/unittests/qir/NVQIRTester.cpp
+++ b/unittests/qir/NVQIRTester.cpp
@@ -160,8 +160,15 @@ CUDAQ_TEST(NVQIRTester, checkReset) {
   Qubit *q1 = *reinterpret_cast<Qubit **>(
       __quantum__rt__array_get_element_ptr_1d(qubits, 1));
 
+#if defined CUDAQ_BACKEND_TENSORNET
+  // Tensornet backends don't have a qubit count limit; just check that they can
+  // perform qubit reset in a loop.
+  constexpr int N_ITERS = 3;
+#else
+  constexpr int N_ITERS = 100;
+#endif
   // Make sure that the state vector doesn't grow with each additional reset
-  for (int i = 0; i < 100; i++) {
+  for (int i = 0; i < N_ITERS; i++) {
     __quantum__qis__reset(q0);
     __quantum__qis__reset(q1);
     __quantum__qis__x(q1);

From 0cbfcc6eff5a92a1347196651a67f78a2d0984a4 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 02:07:35 +0000
Subject: [PATCH 12/18] More test time reduction

---
 .../integration/bug67_vqe_then_sample.cpp     |  2 +-
 .../deuteron_variational_tester.cpp           | 28 +++++++++++--------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/unittests/integration/bug67_vqe_then_sample.cpp b/unittests/integration/bug67_vqe_then_sample.cpp
index 42ea18dfdc..2dbac5ba53 100644
--- a/unittests/integration/bug67_vqe_then_sample.cpp
+++ b/unittests/integration/bug67_vqe_then_sample.cpp
@@ -12,7 +12,7 @@
 #include <cudaq/optimizers.h>
 #include <cudaq/platform.h>
 
-#ifndef CUDAQ_BACKEND_DM
+#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET
 
 CUDAQ_TEST(VqeThenSample, checkBug67) {
 
diff --git a/unittests/integration/deuteron_variational_tester.cpp b/unittests/integration/deuteron_variational_tester.cpp
index 048080d455..74d8585671 100644
--- a/unittests/integration/deuteron_variational_tester.cpp
+++ b/unittests/integration/deuteron_variational_tester.cpp
@@ -58,18 +58,13 @@ CUDAQ_TEST(D2VariationalTester, checkBroadcast) {
   cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) +
                      .21829 * z(0) - 6.125 * z(1);
 
+#if defined CUDAQ_BACKEND_TENSORNET
+  // Reduce test time by reducing the broadcast size.
+  std::vector<double> params{-M_PI, -M_PI + M_2_PI / 49,
+                             -M_PI + 2 * M_2_PI / 49};
+  std::vector<double> expected{12.250290, 12.746370, 13.130148};
+#else
   auto params = cudaq::linspace(-M_PI, M_PI, 50);
-
-  auto ansatz = [](double theta, int size) __qpu__ {
-    cudaq::qvector q(size);
-    x(q[0]);
-    ry(theta, q[1]);
-    x<cudaq::ctrl>(q[1], q[0]);
-  };
-
-  auto results = cudaq::observe(
-      ansatz, h, cudaq::make_argset(params, std::vector(params.size(), 2)));
-
   std::vector<double> expected{
       12.250290, 12.746370, 13.130148, 13.395321, 13.537537, 13.554460,
       13.445811, 13.213375, 12.860969, 12.394379, 11.821267, 11.151042,
@@ -80,6 +75,17 @@ CUDAQ_TEST(D2VariationalTester, checkBroadcast) {
       1.031106,  1.825915,  2.687735,  3.602415,  4.554937,  5.529659,
       6.510578,  7.481585,  8.426738,  9.330517,  10.178082, 10.955516,
       11.650053, 12.250290};
+#endif
+
+  auto ansatz = [](double theta, int size) __qpu__ {
+    cudaq::qvector q(size);
+    x(q[0]);
+    ry(theta, q[1]);
+    x<cudaq::ctrl>(q[1], q[0]);
+  };
+
+  auto results = cudaq::observe(
+      ansatz, h, cudaq::make_argset(params, std::vector(params.size(), 2)));
 
   for (std::size_t counter = 0; auto &el : expected)
     printf("results[%lu] = %.16lf\n", counter++, el);

From 48b037ea47d685d87de3eab4f9b7d884dc76a4cb Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 02:32:02 +0000
Subject: [PATCH 13/18] Wrong formula

---
 unittests/integration/deuteron_variational_tester.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unittests/integration/deuteron_variational_tester.cpp b/unittests/integration/deuteron_variational_tester.cpp
index 74d8585671..5f96059930 100644
--- a/unittests/integration/deuteron_variational_tester.cpp
+++ b/unittests/integration/deuteron_variational_tester.cpp
@@ -60,8 +60,8 @@ CUDAQ_TEST(D2VariationalTester, checkBroadcast) {
 
 #if defined CUDAQ_BACKEND_TENSORNET
   // Reduce test time by reducing the broadcast size.
-  std::vector<double> params{-M_PI, -M_PI + M_2_PI / 49,
-                             -M_PI + 2 * M_2_PI / 49};
+  std::vector<double> params{-M_PI, -M_PI + 2. * M_PI / 49.,
+                             -M_PI + 4. * M_PI / 49.};
   std::vector<double> expected{12.250290, 12.746370, 13.130148};
 #else
   auto params = cudaq::linspace(-M_PI, M_PI, 50);

From 445d7a6c17f583bd65668f30d86e292b70741f96 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 04:01:27 +0000
Subject: [PATCH 14/18] Tidy up the code

---
 .../cutensornet/simulator_mps_register.cpp    | 83 ++++++++++++-------
 runtime/nvqir/cutensornet/tensornet_state.h   |  3 +-
 2 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
index 5605c5dbfa..ed3dac968f 100644
--- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp
+++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -18,6 +18,7 @@ class SimulatorMPS : public SimulatorTensorNetBase {
   // Default relative cutoff
   double m_relCutoff = 1e-5;
   std::vector<void *> m_mpsTensors_d;
+  // List of auxiliary qubits that were used for controlled-gate decomposition.
   std::vector<std::size_t> m_auxQubitsForGateDecomp;
 
 public:
@@ -127,17 +128,21 @@ class SimulatorMPS : public SimulatorTensorNetBase {
         (1ull << (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size()));
     const std::vector<int32_t> projectedModes(m_auxQubitsForGateDecomp.begin(),
                                               m_auxQubitsForGateDecomp.end());
+    // Returns the main qubit register state (auxiliary qubits are projected to
+    // zero state)
     return cudaq::State{{svDim}, m_state->getStateVector(projectedModes)};
   }
 
-  size_t addAuxQubit() {
+  std::vector<size_t> addAuxQubits(std::size_t n) {
     if (m_state->isDirty())
       throw std::runtime_error(
           "[MPS Simulator] Unable to perform multi-control gate decomposition "
           "due to dynamical circuits.");
-    m_state = std::make_unique<TensorNetState>(m_state->getNumQubits() + 1,
+    std::vector<size_t> aux(n);
+    std::iota(aux.begin(), aux.end(), m_state->getNumQubits());
+    m_state = std::make_unique<TensorNetState>(m_state->getNumQubits() + n,
                                                m_cutnHandle);
-    return m_state->getNumQubits() - 1;
+    return aux;
   }
 
   template <typename QuantumOperation>
@@ -150,6 +155,7 @@ class SimulatorMPS : public SimulatorTensorNetBase {
       return;
     }
 
+    // CCNOT decomposition
     const auto ccnot = [&](std::size_t a, std::size_t b, std::size_t c) {
       enqueueQuantumOperation<nvqir::h<double>>({}, {}, {c});
       enqueueQuantumOperation<nvqir::x<double>>({}, {b}, {c});
@@ -168,43 +174,55 @@ class SimulatorMPS : public SimulatorTensorNetBase {
       enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {b});
     };
 
+    // Collects the given list of control qubits into the given auxiliary
+    // qubits: the result lands in aux[ctls.size() - 2], and the auxiliary
+    // qubits before it serve as scratch qubits.
+    //
+    // For example, with 6 control qubits the auxiliary list must provide at
+    // least 5 qubits, and the state of the 6 controls is collected into the
+    // fifth auxiliary qubit (aux[4]).
     const auto collectControls = [&](const std::vector<std::size_t> &ctls,
                                      const std::vector<std::size_t> &aux,
-                                     int adjustment) {
-      for (int i = 0; i < static_cast<int>(ctls.size()) - 1; i += 2) {
-        ccnot(ctls[i], ctls[i + 1], aux[i / 2]);
-      }
-      for (int i = 0; i < static_cast<int>(ctls.size()) / 2 - 1 - adjustment;
-           ++i) {
-        ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]);
-      }
+                                     bool reverse = false) {
+      std::vector<std::tuple<std::size_t, std::size_t, std::size_t>> ccnotList;
+      for (int i = 0; i < static_cast<int>(ctls.size()) - 1; i += 2)
+        ccnotList.emplace_back(
+            std::make_tuple(ctls[i], ctls[i + 1], aux[i / 2]));
+
+      for (int i = 0; i < static_cast<int>(ctls.size()) / 2 - 1; ++i)
+        ccnotList.emplace_back(std::make_tuple(aux[i * 2], aux[(i * 2) + 1],
+                                               aux[i + ctls.size() / 2]));
+
+      if (ctls.size() % 2 != 0)
+        ccnotList.emplace_back(std::make_tuple(
+            ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]));
+
+      if (reverse)
+        std::reverse(ccnotList.begin(), ccnotList.end());
+
+      for (const auto &[a, b, c] : ccnotList)
+        ccnot(a, b, c);
     };
-    const auto adjustForSingleControl =
-        [&](const std::vector<std::size_t> &ctls,
-            const std::vector<std::size_t> &aux) {
-          if (ctls.size() % 2 != 0)
-            ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3],
-                  aux[ctls.size() - 2]);
-        };
-
-    std::vector<std::size_t> aux;
-    for (std::size_t i = 0; i < controls.size() - 1; ++i) {
-      const auto auxQubit = addAuxQubit();
-      m_auxQubitsForGateDecomp.emplace_back(auxQubit);
-      aux.emplace_back(auxQubit);
+
+    if (m_auxQubitsForGateDecomp.size() < controls.size() - 1) {
+      const auto aux =
+          addAuxQubits(controls.size() - 1 - m_auxQubitsForGateDecomp.size());
+      m_auxQubitsForGateDecomp.insert(m_auxQubitsForGateDecomp.end(),
+                                      aux.begin(), aux.end());
     }
 
-    collectControls(controls, aux, 0);
-    adjustForSingleControl(controls, aux);
+    collectControls(controls, m_auxQubitsForGateDecomp);
 
     // Add to the singly-controlled instruction queue
     enqueueQuantumOperation<QuantumOperation>(
-        params, {aux[controls.size() - 2]}, targets);
+        params, {m_auxQubitsForGateDecomp[controls.size() - 2]}, targets);
 
-    adjustForSingleControl(controls, aux);
-    collectControls(controls, aux, 0);
-  }
+    collectControls(controls, m_auxQubitsForGateDecomp, true);
+  };
 
+// Gate implementations:
+// Here, we forward all the calls to the multi-control decomposition helper.
+// Decomposed gates are added to the queue.
 #define CIRCUIT_SIMULATOR_ONE_QUBIT(NAME)                                      \
   using CircuitSimulator::NAME;                                                \
   void NAME(const std::vector<std::size_t> &controls,                          \
@@ -249,11 +267,14 @@ class SimulatorMPS : public SimulatorTensorNetBase {
 #undef CIRCUIT_SIMULATOR_ONE_QUBIT
 #undef CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM
 
+  // Swap gate implementation
   using CircuitSimulator::swap;
   void swap(const std::vector<std::size_t> &ctrlBits, const std::size_t srcIdx,
             const std::size_t tgtIdx) override {
     if (ctrlBits.empty())
       return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx);
+    // Controlled swap gate: using cnot decomposition of swap gate to perform
+    // decomposition.
     {
       std::vector<std::size_t> ctls = ctrlBits;
       ctls.emplace_back(tgtIdx);
@@ -271,6 +292,8 @@ class SimulatorMPS : public SimulatorTensorNetBase {
     }
   }
 
+  // `exp-pauli` gate implementation: forward the middle-controlled Rz to the
+  // decomposition helper.
   void applyExpPauli(double theta, const std::vector<std::size_t> &controls,
                      const std::vector<std::size_t> &qubitIds,
                      const cudaq::spin_op &op) override {
diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
index 7416b3807b..89992eedad 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.h
+++ b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -19,6 +19,7 @@ class TensorNetState {
   std::size_t m_numQubits;
   cutensornetHandle_t m_cutnHandle;
   cutensornetState_t m_quantumState;
+  // Track id of tensors that are applied to the state tensors.
   int64_t m_tensorId = -1;
 
 public:
@@ -75,7 +76,7 @@ class TensorNetState {
 
   /// @brief Number of qubits that this state represents.
   std::size_t getNumQubits() const { return m_numQubits; }
-  
+
   /// @brief True if the state contains gate tensors (not just initial qubit
   /// tensors)
   bool isDirty() const { return m_tensorId > 0; }

From 3e392b95114bd3bb06d174758fdab9e01ef45d13 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 04:31:54 +0000
Subject: [PATCH 15/18] Code reorder

---
 runtime/nvqir/cutensornet/simulator_mps_register.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
index ed3dac968f..043715ff05 100644
--- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp
+++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -111,9 +111,9 @@ class SimulatorMPS : public SimulatorTensorNetBase {
     m_mpsTensors_d.clear();
   }
 
-  void resetExecutionContext() override {
-    SimulatorTensorNetBase::resetExecutionContext();
+  void deallocateStateImpl() override {
     m_auxQubitsForGateDecomp.clear();
+    SimulatorTensorNetBase::deallocateStateImpl();
   }
 
   /// @brief Return the state vector data

From 80a8850e7795c1f2780df85fb25dacdf9958380e Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 04:37:47 +0000
Subject: [PATCH 16/18] Typo fix

---
 unittests/qir/NVQIRTester.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp
index d8008e6590..90da808f38 100644
--- a/unittests/qir/NVQIRTester.cpp
+++ b/unittests/qir/NVQIRTester.cpp
@@ -161,7 +161,7 @@ CUDAQ_TEST(NVQIRTester, checkReset) {
       __quantum__rt__array_get_element_ptr_1d(qubits, 1));
 
 #if defined CUDAQ_BACKEND_TENSORNET
-  // Tensornet backends doesn't have a qubit count limit, just check that it can
+  // Tensornet backends don't have a qubit count limit, just check that it can
   // perform qubit reset in a loop.
   constexpr int N_ITERS = 3;
 #else

From d62007a492ba73558acebef6e37b0eabbfc2593e Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 20 Mar 2024 21:06:20 +0000
Subject: [PATCH 17/18] Code review: optimize control qubit vector usage

---
 runtime/nvqir/cutensornet/simulator_mps_register.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
index 043715ff05..35a8082990 100644
--- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp
+++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -275,19 +275,19 @@ class SimulatorMPS : public SimulatorTensorNetBase {
       return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx);
     // Controlled swap gate: using cnot decomposition of swap gate to perform
     // decomposition.
+    const auto size = ctrlBits.size();
+    std::vector<std::size_t> ctls(size + 1);
+    std::copy(ctrlBits.begin(), ctrlBits.end(), ctls.begin());
     {
-      std::vector<std::size_t> ctls = ctrlBits;
-      ctls.emplace_back(tgtIdx);
+      ctls[size] = tgtIdx;
       decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
     }
     {
-      std::vector<std::size_t> ctls = ctrlBits;
-      ctls.emplace_back(srcIdx);
+      ctls[size] = srcIdx;
       decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {tgtIdx});
     }
     {
-      std::vector<std::size_t> ctls = ctrlBits;
-      ctls.emplace_back(tgtIdx);
+      ctls[size] = tgtIdx;
       decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
     }
   }

From bdce5e63d490551986f0191b6cc5fa45e943b527 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Mon, 1 Apr 2024 19:46:28 +0000
Subject: [PATCH 18/18] Add InvalidTensorIndexValue constant for clarity

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
---
 runtime/nvqir/cutensornet/tensornet_state.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
index 89992eedad..5cf9afbf57 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.h
+++ b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -13,14 +13,18 @@
 #include <unordered_map>
 
 namespace nvqir {
+/// This is used to track whether the tensor state is default-initialized or
+/// already has some gates applied to it.
+constexpr std::int64_t InvalidTensorIndexValue = -1;
+
 /// @brief Wrapper of cutensornetState_t to provide convenient API's for CUDAQ
 /// simulator implementation.
 class TensorNetState {
   std::size_t m_numQubits;
   cutensornetHandle_t m_cutnHandle;
   cutensornetState_t m_quantumState;
-  // Track id of tensors that are applied to the state tensors.
-  int64_t m_tensorId = -1;
+  /// Track id of gate tensors that are applied to the state tensors.
+  std::int64_t m_tensorId = InvalidTensorIndexValue;
 
 public:
   /// @brief Constructor