From b28105e3e3767d72d1cd42e13377eee02da04404 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Thu, 14 Mar 2024 07:45:34 +0000 Subject: [PATCH 01/18] Prototype multi-ctrl via lib --- multi_ctrls.cpp | 96 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 multi_ctrls.cpp diff --git a/multi_ctrls.cpp b/multi_ctrls.cpp new file mode 100644 index 0000000000..3e83558ece --- /dev/null +++ b/multi_ctrls.cpp @@ -0,0 +1,96 @@ +#include +namespace cudaq { + +void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ { + h(c); + cx(b, c); + t(c); + cx(a, c); + t(c); + cx(b, c); + t(c); + cx(a, c); + t(b); + t(c); + h(c); + cx(a, b); + t(a); + t(b); + cx(a, b); +} + +void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux, + int adjustment) __qpu__ { + for (int i = 0; i < ctls.size() - 1; i += 2) { + CCNOT(ctls[i], ctls[i + 1], aux[i / 2]); + } + for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) { + CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); + } +} + +void CollectControls( + const std::vector> &ctls, + cudaq::qview<> aux, int adjustment) __qpu__ { + for (int i = 0; i < ctls.size() - 1; i += 2) { + CCNOT(ctls[i], ctls[i + 1], aux[i / 2]); + } + for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) { + CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); + } +} + +void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ { + if (ctls.size() % 2 != 0) + CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]); +} + +template +decltype(auto) getParameterPackVals(T &&...Args) noexcept { + return std::get(std::forward_as_tuple(std::forward(Args)...)); +} + +template +void x(cudaq::qubit& c0, cudaq::qubit& c1, QubitTy &...qubits) __qpu__ { + static_assert(std::is_same_v); + static constexpr std::size_t qubitCount = sizeof...(qubits) + 2; + static constexpr std::size_t numCtrls = qubitCount - 1; + static_assert(numCtrls > 1); + if constexpr (numCtrls == 2) { + 
CCNOT(c0, + c1, + getParameterPackVals<0>(qubits...)); + } else { + cudaq::qvector aux(numCtrls - 2); + std::vector> ctls{{qubits...}}; + ctls.pop_back(); + ctls.emplace_back(c1); + ctls.emplace_back(c0); + assert(ctls.size() == numCtrls); + cudaq::compute_action( + [&]() { CollectControls(ctls, aux, 1 - (ctls.size() % 2)); }, + [&]() { + if (ctls.size() % 2 != 0) { + CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], getParameterPackVals(qubits...)); + } else { + CCNOT(aux[ctls.size() - 3], aux[ctls.size() - 4], getParameterPackVals(qubits...)); + } + }); + } +} +} // namespace cudaq + +int main() { + + auto kernel = []() __qpu__ { + cudaq::qarray<5> q; + x(q); + x(q[0], q[1], q[2], q[3], q[4]); + mz(q); + }; + + auto counts = cudaq::sample(kernel); + counts.dump(); + + return 0; +} \ No newline at end of file From f696fba2aa3d87d44dc180344fcb9304128d953d Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Fri, 15 Mar 2024 02:44:13 +0000 Subject: [PATCH 02/18] Control gate decomp lib --- .../kernels/decomposition/controlled_gates.h | 121 ++++++++++++++++++ runtime/cudaq/utils/cudaq_utils.h | 7 + 2 files changed, 128 insertions(+) create mode 100644 runtime/cudaq/kernels/decomposition/controlled_gates.h diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h new file mode 100644 index 0000000000..0625032e5a --- /dev/null +++ b/runtime/cudaq/kernels/decomposition/controlled_gates.h @@ -0,0 +1,121 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include +#include + +namespace cudaq { +namespace internal { + +/// Applies the doubly controlled–NOT (CCNOT) gate to three qubits via +/// decomposition. +void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ { + h(c); + cx(b, c); + t(c); + cx(a, c); + t(c); + cx(b, c); + t(c); + cx(a, c); + t(b); + t(c); + h(c); + cx(a, b); + t(a); + t(b); + cx(a, b); +} + +/// Collects the given list of control qubits into one or two of the given +/// auxiliary qubits, using all but the last qubits in the auxiliary list as +/// scratch qubits. +/// For example, if the controls list is 6 qubits, the auxiliary list must be 5 +/// qubits, and the state from the 6 control qubits will be collected into the +/// last qubit of the auxiliary array. +/// The adjustment is used to allow the caller to reduce or increase +/// the number of times this is run based on the eventual number of control +/// qubits needed. +void CollectControls( + const std::vector> &ctls, + cudaq::qview<> aux, int adjustment) __qpu__ { + for (int i = 0; i < ctls.size() - 1; i += 2) { + CCNOT(ctls[i], ctls[i + 1], aux[i / 2]); + } + for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) { + CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); + } +} + +void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux, + int adjustment) __qpu__ { + std::vector> castedCtls; + for (auto &q : ctls) + castedCtls.emplace_back(q); + + CollectControls(castedCtls, aux, adjustment); +} + +/// When collecting controls, if there is an uneven number of original control +/// qubits then the last control and the second to last auxiliary will be +/// collected into the last auxiliary. 
+void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ { + if (ctls.size() % 2 != 0) + CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]); +} + +template +void x(const std::vector> &ctrls, + cudaq::qubit &target) __qpu__ { + static_assert(std::is_same_v); + const std::size_t numCtrls = ctrls.size(); + if (numCtrls == 0) { + x(target); + } else if (numCtrls == 1) { + cx(ctrls[0], target); + } else if (numCtrls == 2) { + CCNOT(ctrls[0], ctrls[1], target); + } else { + cudaq::qvector aux(numCtrls - 2); + cudaq::compute_action( + [&]() { CollectControls(ctrls, aux, 1 - (ctrls.size() % 2)); }, + [&]() { + if (ctrls.size() % 2 != 0) { + CCNOT(ctrls[ctrls.size() - 1], aux[ctrls.size() - 3], target); + } else { + CCNOT(aux[ctrls.size() - 3], aux[ctrls.size() - 4], target); + } + }); + } +} +} // namespace internal + +template +void x(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ { + static_assert(std::is_same_v); + std::vector> ctls{{qubits...}}; + // Last qubit is the target + ctls.pop_back(); + // Add the two explicit qubits + ctls.emplace_back(c1); + ctls.emplace_back(c0); + internal::x( + ctls, cudaq::getParameterPackVals(qubits...)); +} + +template +void x(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ { + static_assert(std::is_same_v); + std::vector> castedCtls; + for (auto &q : ctrls) + castedCtls.emplace_back(q); + internal::x(castedCtls, target); +} +} // namespace cudaq diff --git a/runtime/cudaq/utils/cudaq_utils.h b/runtime/cudaq/utils/cudaq_utils.h index 9e3280a232..16d7d824d4 100644 --- a/runtime/cudaq/utils/cudaq_utils.h +++ b/runtime/cudaq/utils/cudaq_utils.h @@ -188,6 +188,13 @@ void tuple_for_each_with_idx(TupleType &&t, FunctionType f) { std::integral_constant()); } +// Utility function to access a parameter at index from a variadic parameter +// pack. 
+template +decltype(auto) getParameterPackVals(T &&...Args) noexcept { + return std::get(std::forward_as_tuple(std::forward(Args)...)); +} + // Function check if file with given path+name exists inline bool fileExists(const std::string &name) { if (FILE *file = fopen(name.c_str(), "r")) { From dd23431e54fc2a7fb719d71b3bb6fa453b22fff5 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Fri, 15 Mar 2024 03:36:26 +0000 Subject: [PATCH 03/18] Add z --- .../kernels/decomposition/controlled_gates.h | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h index 0625032e5a..92bf17d3d7 100644 --- a/runtime/cudaq/kernels/decomposition/controlled_gates.h +++ b/runtime/cudaq/kernels/decomposition/controlled_gates.h @@ -66,7 +66,9 @@ void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux, /// When collecting controls, if there is an uneven number of original control /// qubits then the last control and the second to last auxiliary will be /// collected into the last auxiliary. 
-void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ { +void AdjustForSingleControl( + const std::vector> &ctls, + cudaq::qview<> aux) __qpu__ { if (ctls.size() % 2 != 0) CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]); } @@ -79,7 +81,7 @@ void x(const std::vector> &ctrls, if (numCtrls == 0) { x(target); } else if (numCtrls == 1) { - cx(ctrls[0], target); + cx(ctrls[0].get(), target); } else if (numCtrls == 2) { CCNOT(ctrls[0], ctrls[1], target); } else { @@ -95,6 +97,26 @@ void x(const std::vector> &ctrls, }); } } + +template +void z(const std::vector> &ctrls, + cudaq::qubit &target) __qpu__ { + static_assert(std::is_same_v); + const std::size_t numCtrls = ctrls.size(); + if (numCtrls == 0) { + z(target); + } else if (numCtrls == 1) { + z(ctrls[0].get(), target); + } else { + cudaq::qvector aux(numCtrls - 1); + cudaq::compute_action( + [&]() { + CollectControls(ctrls, aux, 0); + AdjustForSingleControl(ctrls, aux); + }, + [&]() { z(aux[ctrls.size() - 2], target); }); + } +} } // namespace internal template @@ -118,4 +140,26 @@ void x(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ { castedCtls.emplace_back(q); internal::x(castedCtls, target); } + +template +void z(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ { + static_assert(std::is_same_v); + std::vector> ctls{{qubits...}}; + // Last qubit is the target + ctls.pop_back(); + // Add the two explicit qubits + ctls.emplace_back(c1); + ctls.emplace_back(c0); + internal::z( + ctls, cudaq::getParameterPackVals(qubits...)); +} + +template +void z(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ { + static_assert(std::is_same_v); + std::vector> castedCtls; + for (auto &q : ctrls) + castedCtls.emplace_back(q); + internal::z(castedCtls, target); +} } // namespace cudaq From 18886a5533fc5f999b40f5c9701bb53f745a350d Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Fri, 15 Mar 2024 05:47:49 +0000 Subject: [PATCH 04/18] Lib mode for all multi-control 
decomp --- .../kernels/decomposition/controlled_gates.h | 169 ++++++++++++------ 1 file changed, 111 insertions(+), 58 deletions(-) diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h index 92bf17d3d7..254eb0822a 100644 --- a/runtime/cudaq/kernels/decomposition/controlled_gates.h +++ b/runtime/cudaq/kernels/decomposition/controlled_gates.h @@ -98,68 +98,121 @@ void x(const std::vector> &ctrls, } } -template -void z(const std::vector> &ctrls, - cudaq::qubit &target) __qpu__ { - static_assert(std::is_same_v); - const std::size_t numCtrls = ctrls.size(); - if (numCtrls == 0) { - z(target); - } else if (numCtrls == 1) { - z(ctrls[0].get(), target); - } else { - cudaq::qvector aux(numCtrls - 1); - cudaq::compute_action( - [&]() { - CollectControls(ctrls, aux, 0); - AdjustForSingleControl(ctrls, aux); - }, - [&]() { z(aux[ctrls.size() - 2], target); }); +#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(NAME) \ + template \ + void NAME(const std::vector> &ctrls, \ + cudaq::qubit &target) __qpu__ { \ + static_assert(std::is_same_v); \ + const std::size_t numCtrls = ctrls.size(); \ + if (numCtrls == 0) { \ + NAME(target); \ + } else if (numCtrls == 1) { \ + NAME(ctrls[0].get(), target); \ + } else { \ + cudaq::qvector aux(numCtrls - 1); \ + cudaq::compute_action( \ + [&]() { \ + CollectControls(ctrls, aux, 0); \ + AdjustForSingleControl(ctrls, aux); \ + }, \ + [&]() { NAME(aux[ctrls.size() - 2], target); }); \ + } \ } -} + +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(h) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(y) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(z) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(t) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(s) + +#define CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(NAME) \ + template \ + void NAME(ScalarAngle angle, \ + const std::vector> &ctrls, \ + cudaq::qubit &target) __qpu__ { \ + static_assert(std::is_same_v); \ + const std::size_t numCtrls = 
ctrls.size(); \ + if (numCtrls == 0) { \ + NAME(angle, target); \ + } else if (numCtrls == 1) { \ + oneQubitSingleParameterApply( \ + angle, ctrls[0].get(), target); \ + } else { \ + cudaq::qvector aux(numCtrls - 1); \ + cudaq::compute_action( \ + [&]() { \ + CollectControls(ctrls, aux, 0); \ + AdjustForSingleControl(ctrls, aux); \ + }, \ + [&]() { \ + oneQubitSingleParameterApply( \ + angle, aux[ctrls.size() - 2], target); \ + }); \ + } \ + } + +CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rx) +CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(ry) +CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rz) +CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(r1) } // namespace internal -template -void x(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ { - static_assert(std::is_same_v); - std::vector> ctls{{qubits...}}; - // Last qubit is the target - ctls.pop_back(); - // Add the two explicit qubits - ctls.emplace_back(c1); - ctls.emplace_back(c0); - internal::x( - ctls, cudaq::getParameterPackVals(qubits...)); -} +#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(NAME) \ + template \ + void NAME(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ { \ + static_assert(std::is_same_v); \ + std::vector> ctls{{qubits...}}; \ + /* Last qubit is the target */ \ + ctls.pop_back(); \ + /*Add the two explicit qubits */ \ + ctls.emplace_back(c1); \ + ctls.emplace_back(c0); \ + internal::NAME( \ + ctls, cudaq::getParameterPackVals(qubits...)); \ + } \ + template \ + void NAME(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ { \ + static_assert(std::is_same_v); \ + std::vector> castedCtls; \ + for (auto &q : ctrls) \ + castedCtls.emplace_back(q); \ + internal::NAME(castedCtls, target); \ + } -template -void x(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ { - static_assert(std::is_same_v); - std::vector> castedCtls; - for (auto &q : ctrls) - castedCtls.emplace_back(q); - internal::x(castedCtls, target); -} 
+CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(h) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(x) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(y) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(z) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(t) +CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(s) -template -void z(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ { - static_assert(std::is_same_v); - std::vector> ctls{{qubits...}}; - // Last qubit is the target - ctls.pop_back(); - // Add the two explicit qubits - ctls.emplace_back(c1); - ctls.emplace_back(c0); - internal::z( - ctls, cudaq::getParameterPackVals(qubits...)); -} +#define CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(NAME) \ + template \ + void NAME(ScalarAngle angle, cudaq::qubit &c0, cudaq::qubit &c1, \ + QubitTy &...qubits) __qpu__ { \ + static_assert(std::is_same_v); \ + static_assert(sizeof...(qubits) > 0); \ + std::vector> ctls{{qubits...}}; \ + /* Last qubit is the target */ \ + cudaq::qubit &target = ctls.back(); \ + ctls.pop_back(); \ + /*Add the two explicit qubits */ \ + ctls.emplace_back(c1); \ + ctls.emplace_back(c0); \ + internal::NAME(angle, ctls, target); \ + } \ + template \ + void NAME(ScalarAngle angle, cudaq::qview<> ctrls, cudaq::qubit &target) \ + __qpu__ { \ + static_assert(std::is_same_v); \ + std::vector> castedCtls; \ + for (auto &q : ctrls) \ + castedCtls.emplace_back(q); \ + internal::NAME(angle, castedCtls, target); \ + } -template -void z(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ { - static_assert(std::is_same_v); - std::vector> castedCtls; - for (auto &q : ctrls) - castedCtls.emplace_back(q); - internal::z(castedCtls, target); -} +CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rx) +CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(ry) +CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rz) +CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(r1) } // namespace cudaq From fa0c34a27db8025db021c07c8528b950d6031ca6 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Fri, 15 Mar 2024 06:08:37 +0000 Subject: [PATCH 
05/18] Hook library-based decomp to MPS --- runtime/cudaq/qis/qubit_qis.h | 4 ++++ runtime/nvqir/cutensornet/CMakeLists.txt | 4 ++-- runtime/nvqir/cutensornet/tensornet-mps.config | 12 ++++++++++++ runtime/nvqir/cutensornet/tensornet.config | 11 +++++++++++ unittests/integration/ccnot_tester.cpp | 3 --- unittests/integration/gate_library_tester.cpp | 4 +--- unittests/integration/grover_test.cpp | 3 --- unittests/integration/negative_controls_tester.cpp | 4 +--- unittests/qis/QubitQISTester.cpp | 3 --- 9 files changed, 31 insertions(+), 17 deletions(-) create mode 100644 runtime/nvqir/cutensornet/tensornet-mps.config create mode 100644 runtime/nvqir/cutensornet/tensornet.config diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h index 45ddb847b7..559b7d5521 100644 --- a/runtime/cudaq/qis/qubit_qis.h +++ b/runtime/cudaq/qis/qubit_qis.h @@ -633,3 +633,7 @@ std::vector slice_vector(std::vector &original, std::size_t start, } } // namespace cudaq + +#if defined(CUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION) +#include +#endif diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt index 23f10564af..22d8e30e8a 100644 --- a/runtime/nvqir/cutensornet/CMakeLists.txt +++ b/runtime/nvqir/cutensornet/CMakeLists.txt @@ -69,12 +69,12 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3") target_include_directories(nvqir-${LIBRARY_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/runtime/common ${CMAKE_SOURCE_DIR}/runtime/nvqir ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR}) target_link_libraries(nvqir-${LIBRARY_NAME} PRIVATE fmt::fmt-header-only cudaq cudaq-common ${CUTENSORNET_LIB} ${CUTENSOR_LIB} CUDA::cudart) install(TARGETS nvqir-${LIBRARY_NAME} DESTINATION lib) - file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\n") - install(FILES ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config DESTINATION targets) endmacro() 
nvqir_create_cutn_plugin(tensornet ${BASE_TENSOR_BACKEND_SRS} simulator_tensornet_register.cpp ) nvqir_create_cutn_plugin(tensornet-mps ${BASE_TENSOR_BACKEND_SRS} simulator_mps_register.cpp) + add_target_config(tensornet) + add_target_config(tensornet-mps) add_library(tensornet-mpi-util OBJECT mpi_support.cpp) target_include_directories(tensornet-mpi-util PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/runtime) target_link_libraries(tensornet-mpi-util PRIVATE cudaq-common fmt::fmt-header-only) diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config new file mode 100644 index 0000000000..2502f96e31 --- /dev/null +++ b/runtime/nvqir/cutensornet/tensornet-mps.config @@ -0,0 +1,12 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +NVQIR_SIMULATION_BACKEND="tensornet-mps" +TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target" +GPU_REQUIREMENTS="true" +COMPILER_FLAGS="$COMPILER_FLAGS -DCUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION" diff --git a/runtime/nvqir/cutensornet/tensornet.config b/runtime/nvqir/cutensornet/tensornet.config new file mode 100644 index 0000000000..9f47ba1c88 --- /dev/null +++ b/runtime/nvqir/cutensornet/tensornet.config @@ -0,0 +1,11 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # + +NVQIR_SIMULATION_BACKEND="tensornet" +TARGET_DESCRIPTION="cuTensorNet-based full tensor network contraction backend target" +GPU_REQUIREMENTS="true" diff --git a/unittests/integration/ccnot_tester.cpp b/unittests/integration/ccnot_tester.cpp index 4d4a4220c8..4c1030fe34 100644 --- a/unittests/integration/ccnot_tester.cpp +++ b/unittests/integration/ccnot_tester.cpp @@ -54,8 +54,6 @@ struct nested_ctrl { } }; -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits CUDAQ_TEST(CCNOTTester, checkSimple) { auto ccnot = []() { cudaq::qvector q(3); @@ -96,4 +94,3 @@ CUDAQ_TEST(FredkinTester, checkTruth) { EXPECT_EQ(counts.size(), 1); EXPECT_EQ(counts.begin()->first, "110"); } -#endif diff --git a/unittests/integration/gate_library_tester.cpp b/unittests/integration/gate_library_tester.cpp index 78de65f7d5..3d976199f5 100644 --- a/unittests/integration/gate_library_tester.cpp +++ b/unittests/integration/gate_library_tester.cpp @@ -88,8 +88,7 @@ CUDAQ_TEST(GateLibraryTester, checkGivensRotationKernelBuilder) { } } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits + CUDAQ_TEST(GateLibraryTester, checkControlledGivensRotation) { for (const auto &angle : cudaq::linspace(-M_PI, M_PI, NUM_ANGLES)) { // Same check, with 2 control qubits @@ -117,7 +116,6 @@ CUDAQ_TEST(GateLibraryTester, checkControlledGivensRotation) { EXPECT_NEAR(std::abs(ss_01_off[0]), 1.0, 1e-6); } } -#endif CUDAQ_TEST(GateLibraryTester, checkFermionicSwap) { for (const auto &angle : cudaq::linspace(-M_PI, M_PI, NUM_ANGLES)) { diff --git a/unittests/integration/grover_test.cpp b/unittests/integration/grover_test.cpp index be4fd3a374..3e64a749c7 100644 --- a/unittests/integration/grover_test.cpp +++ b/unittests/integration/grover_test.cpp @@ -48,8 +48,6 @@ struct oracle { } }; -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 
2 qubits CUDAQ_TEST(GroverTester, checkNISQ) { using namespace cudaq; auto counts = cudaq::sample(1000, run_grover{}, 3, 1, oracle{}); @@ -62,4 +60,3 @@ CUDAQ_TEST(GroverTester, checkNISQ) { } EXPECT_EQ(counter, 1000); } -#endif diff --git a/unittests/integration/negative_controls_tester.cpp b/unittests/integration/negative_controls_tester.cpp index 6bd13b3404..c46e8ac48c 100644 --- a/unittests/integration/negative_controls_tester.cpp +++ b/unittests/integration/negative_controls_tester.cpp @@ -7,8 +7,7 @@ ******************************************************************************/ #include "CUDAQTestUtils.h" -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits + CUDAQ_TEST(NegativeControlsTester, checkSimple) { auto kernel = []() __qpu__ { @@ -63,4 +62,3 @@ CUDAQ_TEST(NegativeControlsTester, checkSimple) { EXPECT_EQ(counter, 1000); } -#endif diff --git a/unittests/qis/QubitQISTester.cpp b/unittests/qis/QubitQISTester.cpp index 8dc488d80d..cf92aa507a 100644 --- a/unittests/qis/QubitQISTester.cpp +++ b/unittests/qis/QubitQISTester.cpp @@ -157,8 +157,6 @@ CUDAQ_TEST(QubitQISTester, checkCommonKernel) { EXPECT_NEAR(energy, -1.7487, 1e-3); } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits CUDAQ_TEST(QubitQISTester, checkCtrlRegion) { auto ccnot = []() { @@ -229,7 +227,6 @@ CUDAQ_TEST(QubitQISTester, checkCtrlRegion) { EXPECT_EQ(1, counts3.size()); EXPECT_TRUE(counts3.begin()->first == "101"); } -#endif CUDAQ_TEST(QubitQISTester, checkAdjointRegions) { struct single_adjoint_test { From 1cd53255cc0b48010267e4500dbb4e52f819b040 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Tue, 19 Mar 2024 08:02:12 +0000 Subject: [PATCH 06/18] Make multi-ctrl decomposition an execution manager since we need to intercept control --- runtime/nvqir/cutensornet/CMakeLists.txt | 18 + .../nvqir/cutensornet/MpsExecutionManager.cpp | 467 ++++++++++++++++++ .../nvqir/cutensornet/tensornet-mps.config | 1 + 3 files 
changed, 486 insertions(+) create mode 100644 runtime/nvqir/cutensornet/MpsExecutionManager.cpp diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt index 22d8e30e8a..9cb46fd27f 100644 --- a/runtime/nvqir/cutensornet/CMakeLists.txt +++ b/runtime/nvqir/cutensornet/CMakeLists.txt @@ -90,6 +90,24 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3") target_link_libraries(nvqir-tensornet PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive) target_link_libraries(nvqir-tensornet-mps PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive) endif() + + add_library(cudaq-em-mps SHARED MpsExecutionManager.cpp) + set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-em-mps) + target_include_directories(cudaq-em-mps + PUBLIC + $ + $ + PRIVATE .) + + target_link_libraries(cudaq-em-mps + PUBLIC cudaq-spin PRIVATE nvqir cudaq-common fmt::fmt-header-only LLVMSupport) + + install(TARGETS cudaq-em-mps EXPORT cudaq-em-mps-targets DESTINATION lib) + + install(EXPORT cudaq-em-mps-targets + FILE CUDAQEmMpsTargets.cmake + NAMESPACE cudaq:: + DESTINATION lib/cmake/cudaq) else() message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.3.0+.") endif() diff --git a/runtime/nvqir/cutensornet/MpsExecutionManager.cpp b/runtime/nvqir/cutensornet/MpsExecutionManager.cpp new file mode 100644 index 0000000000..8245a0f131 --- /dev/null +++ b/runtime/nvqir/cutensornet/MpsExecutionManager.cpp @@ -0,0 +1,467 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "common/Logger.h" +#include "cudaq/qis/managers/BasicExecutionManager.h" +#include "nvqir/CircuitSimulator.h" +#include "llvm/ADT/StringSwitch.h" + +namespace nvqir { +CircuitSimulator *getCircuitSimulatorInternal(); +} +namespace { +class MpsExecutionManager : public cudaq::BasicExecutionManager { +private: + nvqir::CircuitSimulator *simulator() { + return nvqir::getCircuitSimulatorInternal(); + } + + /// @brief To improve `qudit` allocation, we defer + /// single `qudit` allocation requests until the first + /// encountered `apply` call. + std::vector requestedAllocations; + std::vector auxQuditIdsForDeletion; + + std::vector + decomposeMultiControlledInstruction(const Instruction &instruction, + std::vector &aux) { + // Get the data, create the Qubit* targets + auto [gateName, parameters, controls, targets, op] = instruction; + if (controls.size() + targets.size() <= 2) { + return {instruction}; + } + std::vector decomposedInsts; + + const auto makeInstruction = + [](const std::string gateName, const std::vector &gateParams, + const std::vector &ctrls, + const std::vector &targets) -> Instruction { + return std::make_tuple(gateName, gateParams, ctrls, targets, cudaq::spin_op{}); + }; + + if (targets.size() > 1) { + if (gateName == "swap") { + { + auto mutableCtrls = controls; + mutableCtrls.emplace_back(targets[0]); + const auto insts = decomposeMultiControlledInstruction( + makeInstruction("x", {}, mutableCtrls, {targets[1]}), + auxQuditIdsForDeletion); + decomposedInsts.insert(decomposedInsts.end(), insts.begin(), + insts.end()); + } + { + auto mutableCtrls = controls; + mutableCtrls.emplace_back(targets[1]); + const auto insts = decomposeMultiControlledInstruction( + makeInstruction("x", {}, mutableCtrls, {targets[0]}), + auxQuditIdsForDeletion); + decomposedInsts.insert(decomposedInsts.end(), insts.begin(), + insts.end()); + } + { + auto mutableCtrls = controls; + 
mutableCtrls.emplace_back(targets[0]); + const auto insts = decomposeMultiControlledInstruction( + makeInstruction("x", {}, mutableCtrls, {targets[1]}), + auxQuditIdsForDeletion); + decomposedInsts.insert(decomposedInsts.end(), insts.begin(), + insts.end()); + } + return decomposedInsts; + } else if (gateName == "exp_pauli") { + if (controls.size() <= 1) { + return {instruction}; + } else { + std::vector qubitSupport; + std::vector> basisChange; + op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) { + if (type != cudaq::pauli::I) + qubitSupport.push_back(targets[qubitIdx]); + + if (type == cudaq::pauli::Y) + basisChange.emplace_back([&, qubitIdx](bool reverse) { + decomposedInsts.emplace_back( + makeInstruction("rx", {!reverse ? M_PI_2 : -M_PI_2}, {}, + {targets[qubitIdx]})); + }); + else if (type == cudaq::pauli::X) + basisChange.emplace_back([&, qubitIdx](bool) { + decomposedInsts.emplace_back( + makeInstruction("h", {}, {}, {targets[qubitIdx]})); + }); + }); + + if (!basisChange.empty()) + for (auto &basis : basisChange) + basis(false); + + std::vector> toReverse; + for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) { + decomposedInsts.emplace_back(makeInstruction( + "x", {}, {qubitSupport[i]}, {qubitSupport[i + 1]})); + toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]); + } + + // Since this is a compute-action-uncompute type circuit, we only need + // to apply control on this rz gate. 
+ { + const auto mcRzInsts = decomposeMultiControlledInstruction( + makeInstruction("rz", {-2.0 * parameters[0]}, controls, + {qubitSupport.back()}), + auxQuditIdsForDeletion); + decomposedInsts.insert(decomposedInsts.end(), mcRzInsts.begin(), + mcRzInsts.end()); + } + + std::reverse(toReverse.begin(), toReverse.end()); + for (auto &[i, j] : toReverse) + decomposedInsts.emplace_back(makeInstruction("x", {}, {i}, {j})); + + if (!basisChange.empty()) { + std::reverse(basisChange.begin(), basisChange.end()); + for (auto &basis : basisChange) + basis(true); + } + return decomposedInsts; + } + } else { + throw std::runtime_error("Unsupported: " + gateName); + } + } + + const auto ccnot = [&](cudaq::QuditInfo &a, cudaq::QuditInfo &b, + cudaq::QuditInfo &c) { + decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c})); + decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c})); + decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c})); + decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c})); + decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c})); + decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c})); + decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c})); + decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c})); + decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {b})); + decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c})); + decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c})); + decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b})); + decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {a})); + decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {b})); + decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b})); + }; + + const auto collectControls = [&](std::vector &ctls, + std::vector &aux, + int adjustment) { + for (int i = 0; i < static_cast(ctls.size()) - 1; i += 2) { + ccnot(ctls[i], ctls[i + 1], aux[i / 2]); + } + for 
(int i = 0; i < static_cast(ctls.size()) / 2 - 1 - adjustment; + ++i) { + ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); + } + }; + const auto adjustForSingleControl = + [&](std::vector &ctls, + std::vector &aux) { + if (ctls.size() % 2 != 0) + ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3], + aux[ctls.size() - 2]); + }; + for (std::size_t i = aux.size(); i < controls.size() - 1; ++i) + aux.emplace_back(cudaq::QuditInfo(2, getAvailableIndex(2))); + + collectControls(controls, aux, 0); + adjustForSingleControl(controls, aux); + // Add to the instruction queue + decomposedInsts.emplace_back( + std::move(gateName), parameters, + std::vector{aux[controls.size() - 2]}, targets, op); + adjustForSingleControl(controls, aux); + collectControls(controls, aux, 0); + return decomposedInsts; + } + + /// @brief Allocate all requested `qudits`. + void flushRequestedAllocations() { + if (requestedAllocations.empty()) + return; + + allocateQudits(requestedAllocations); + requestedAllocations.clear(); + } + +protected: + void allocateQudit(const cudaq::QuditInfo &q) override { + requestedAllocations.emplace_back(2, q.id); + } + + void allocateQudits(const std::vector &qudits) override { + simulator()->allocateQubits(qudits.size()); + } + + void deallocateQudit(const cudaq::QuditInfo &q) override { + + // Before trying to deallocate, make sure the qudit hasn't + // been requested but not allocated. 
+ auto iter = + std::find(requestedAllocations.begin(), requestedAllocations.end(), q); + if (iter != requestedAllocations.end()) { + requestedAllocations.erase(iter); + return; + } + + simulator()->deallocate(q.id); + } + + void deallocateQudits(const std::vector &qudits) override { + std::vector local; + for (auto &q : qudits) { + auto iter = std::find(requestedAllocations.begin(), + requestedAllocations.end(), q); + if (iter != requestedAllocations.end()) { + requestedAllocations.erase(iter); + } else { + local.push_back(q.id); + } + } + + simulator()->deallocateQubits(local); + } + + void handleExecutionContextChanged() override { + requestedAllocations.clear(); + simulator()->setExecutionContext(executionContext); + } + + void handleExecutionContextEnded() override { + simulator()->resetExecutionContext(); + } + + void executeInstruction(const Instruction &instruction) override { + flushRequestedAllocations(); + + // Get the data, create the Qubit* targets + auto [gateName, parameters, controls, targets, op] = instruction; + + // Map the Qudits to Qubits + std::vector localT; + std::transform(targets.begin(), targets.end(), std::back_inserter(localT), + [](auto &&el) { return el.id; }); + std::vector localC; + std::transform(controls.begin(), controls.end(), std::back_inserter(localC), + [](auto &&el) { return el.id; }); + + // Apply the gate + llvm::StringSwitch>(gateName) + .Case("h", [&]() { simulator()->h(localC, localT[0]); }) + .Case("x", [&]() { simulator()->x(localC, localT[0]); }) + .Case("y", [&]() { simulator()->y(localC, localT[0]); }) + .Case("z", [&]() { simulator()->z(localC, localT[0]); }) + .Case("rx", + [&]() { simulator()->rx(parameters[0], localC, localT[0]); }) + .Case("ry", + [&]() { simulator()->ry(parameters[0], localC, localT[0]); }) + .Case("rz", + [&]() { simulator()->rz(parameters[0], localC, localT[0]); }) + .Case("s", [&]() { simulator()->s(localC, localT[0]); }) + .Case("t", [&]() { simulator()->t(localC, localT[0]); }) + 
.Case("sdg", [&]() { simulator()->sdg(localC, localT[0]); }) + .Case("tdg", [&]() { simulator()->tdg(localC, localT[0]); }) + .Case("r1", + [&]() { simulator()->r1(parameters[0], localC, localT[0]); }) + .Case("u1", + [&]() { simulator()->u1(parameters[0], localC, localT[0]); }) + .Case("u3", + [&]() { + simulator()->u3(parameters[0], parameters[1], parameters[2], + localC, localT[0]); + }) + .Case("swap", + [&]() { simulator()->swap(localC, localT[0], localT[1]); }) + .Case("exp_pauli", + [&]() { + simulator()->applyExpPauli(parameters[0], localC, localT, op); + }) + .Default([&]() { + throw std::runtime_error("[DefaultExecutionManager] invalid gate " + "application requested " + + gateName + "."); + })(); + } + + int measureQudit(const cudaq::QuditInfo &q, + const std::string ®isterName) override { + flushRequestedAllocations(); + return simulator()->mz(q.id, registerName); + } + + void measureSpinOp(const cudaq::spin_op &op) override { + flushRequestedAllocations(); + simulator()->flushGateQueue(); + + if (executionContext->canHandleObserve) { + auto result = simulator()->observe(*executionContext->spin.value()); + executionContext->expectationValue = result.expectationValue; + executionContext->result = cudaq::sample_result(result); + return; + } + + assert(op.num_terms() == 1 && "Number of terms is not 1."); + + cudaq::info("Measure {}", op.to_string(false)); + std::vector qubitsToMeasure; + std::vector> basisChange; + op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) { + if (type != cudaq::pauli::I) + qubitsToMeasure.push_back(qubitIdx); + + if (type == cudaq::pauli::Y) + basisChange.emplace_back([&, qubitIdx](bool reverse) { + simulator()->rx(!reverse ? 
M_PI_2 : -M_PI_2, qubitIdx); + }); + else if (type == cudaq::pauli::X) + basisChange.emplace_back( + [&, qubitIdx](bool) { simulator()->h(qubitIdx); }); + }); + + // Change basis, flush the queue + if (!basisChange.empty()) { + for (auto &basis : basisChange) + basis(false); + + simulator()->flushGateQueue(); + } + + // Get whether this is shots-based + int shots = 0; + if (executionContext->shots > 0) + shots = executionContext->shots; + + // Sample and give the data to the context + cudaq::ExecutionResult result = simulator()->sample(qubitsToMeasure, shots); + executionContext->expectationValue = result.expectationValue; + executionContext->result = cudaq::sample_result(result); + + // Restore the state. + if (!basisChange.empty()) { + std::reverse(basisChange.begin(), basisChange.end()); + for (auto &basis : basisChange) + basis(true); + + simulator()->flushGateQueue(); + } + } + +public: + MpsExecutionManager() { + cudaq::info("[MpsExecutionManager] Creating the {} backend.", + simulator()->name()); + } + virtual ~MpsExecutionManager() = default; + + void resetQudit(const cudaq::QuditInfo &q) override { + flushRequestedAllocations(); + simulator()->resetQubit(q.id); + } + + void endAdjointRegion() override { + assert(!adjointQueueStack.empty() && "There must be at least one queue"); + + auto adjointQueue = std::move(adjointQueueStack.back()); + adjointQueueStack.pop_back(); + + // Select the queue to which these instructions will be added. + InstructionQueue *queue = adjointQueueStack.empty() + ? &instructionQueue + : &(adjointQueueStack.back()); + + std::reverse(adjointQueue.begin(), adjointQueue.end()); + for (auto &instruction : adjointQueue) { + const auto insts = decomposeMultiControlledInstruction( + instruction, auxQuditIdsForDeletion); + queue->insert(queue->end(), insts.begin(), insts.end()); + } + } + + /// The goal for apply is to create a new element of the + /// instruction queue (a tuple). 
+ void apply(const std::string_view gateName, const std::vector ¶ms, + const std::vector &controls, + const std::vector &targets, + bool isAdjoint, cudaq::spin_op op) override { + + // Make a copy of the name that we can mutate if necessary + std::string mutable_name(gateName); + + // Make a copy of the parameters that we can mutate + std::vector mutable_params = params; + + // Create an array of controls, we will + // prepend any extra controls if in a control region + std::vector mutable_controls; + for (auto &e : extraControlIds) + mutable_controls.emplace_back(2, e); + + for (auto &e : controls) + mutable_controls.push_back(e); + + std::vector mutable_targets; + for (auto &t : targets) + mutable_targets.push_back(t); + // We need to check if we need take the adjoint of the operation. To do this + // we use a logical XOR between `isAdjoint` and whether the size of + // `adjointQueueStack` is even. The size of `adjointQueueStack` corresponds + // to the number of nested `cudaq::adjoint` calls. If the size is even, then + // we need to change the operation when `isAdjoint` is true. If the size is + // odd, then we need to change the operation when `isAdjoint` is false. + // (Adjoint modifiers cancel each other, e.g, `adj adj r1` is `r1`.) 
+ // + // The cases: + // * not-adjoint, even number of `cudaq::adjoint` => _no_ need to change op + // * not-adjoint, odd number of `cudaq::adjoint` => change op + // * adjoint, even number of `cudaq::adjoint` => change op + // * adjoint, odd number `cudaq::adjoint` => _no_ need to change op + // + bool evenAdjointStack = (adjointQueueStack.size() % 2) == 0; + if (isAdjoint != !evenAdjointStack) { + for (std::size_t i = 0; i < params.size(); i++) + mutable_params[i] = -1.0 * params[i]; + if (gateName == "t") + mutable_name = "tdg"; + else if (gateName == "s") + mutable_name = "sdg"; + } + + if (!adjointQueueStack.empty()) { + // Add to the adjoint instruction queue + adjointQueueStack.back().emplace_back( + mutable_name, mutable_params, mutable_controls, mutable_targets, op); + return; + } + + const auto insts = decomposeMultiControlledInstruction( + {std::move(mutable_name), mutable_params, mutable_controls, + mutable_targets, op}, + auxQuditIdsForDeletion); + instructionQueue.insert(instructionQueue.end(), insts.begin(), insts.end()); + } + + void resetExecutionContext() override { + BasicExecutionManager::resetExecutionContext(); + + deallocateQudits(auxQuditIdsForDeletion); + for (auto &q : auxQuditIdsForDeletion) { + returnIndex(q.id); + } + auxQuditIdsForDeletion.clear(); + } +}; + +} // namespace + +CUDAQ_REGISTER_EXECUTION_MANAGER(MpsExecutionManager) \ No newline at end of file diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config index 2502f96e31..040ae0d8bf 100644 --- a/runtime/nvqir/cutensornet/tensornet-mps.config +++ b/runtime/nvqir/cutensornet/tensornet-mps.config @@ -10,3 +10,4 @@ NVQIR_SIMULATION_BACKEND="tensornet-mps" TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target" GPU_REQUIREMENTS="true" COMPILER_FLAGS="$COMPILER_FLAGS -DCUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION" +LIBRARY_MODE_EXECUTION_MANAGER="mps" \ No newline at end of file From 
ec8020481abdea81d7cd276b9dc43c4e753f7e49 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 00:34:42 +0000 Subject: [PATCH 07/18] add decomp to simulator --- runtime/cudaq/qis/qubit_qis.h | 4 - runtime/nvqir/cutensornet/CMakeLists.txt | 18 - .../nvqir/cutensornet/MpsExecutionManager.cpp | 467 ------------------ .../cutensornet/simulator_mps_register.cpp | 219 +++++++- .../nvqir/cutensornet/tensornet-mps.config | 2 - runtime/nvqir/cutensornet/tensornet_state.cpp | 28 +- runtime/nvqir/cutensornet/tensornet_state.h | 9 +- unittests/integration/builder_tester.cpp | 15 - 8 files changed, 240 insertions(+), 522 deletions(-) delete mode 100644 runtime/nvqir/cutensornet/MpsExecutionManager.cpp diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h index cb2f7caf83..9197720f48 100644 --- a/runtime/cudaq/qis/qubit_qis.h +++ b/runtime/cudaq/qis/qubit_qis.h @@ -636,7 +636,3 @@ std::vector slice_vector(std::vector &original, std::size_t start, } } // namespace cudaq - -#if defined(CUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION) -#include -#endif diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt index 9cb46fd27f..22d8e30e8a 100644 --- a/runtime/nvqir/cutensornet/CMakeLists.txt +++ b/runtime/nvqir/cutensornet/CMakeLists.txt @@ -90,24 +90,6 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3") target_link_libraries(nvqir-tensornet PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive) target_link_libraries(nvqir-tensornet-mps PRIVATE -Wl,--whole-archive ${CUDAQ_CUTENSORNET_PLUGIN_LIB} -Wl,--no-whole-archive) endif() - - add_library(cudaq-em-mps SHARED MpsExecutionManager.cpp) - set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-em-mps) - target_include_directories(cudaq-em-mps - PUBLIC - $ - $ - PRIVATE .) 
- - target_link_libraries(cudaq-em-mps - PUBLIC cudaq-spin PRIVATE nvqir cudaq-common fmt::fmt-header-only LLVMSupport) - - install(TARGETS cudaq-em-mps EXPORT cudaq-em-mps-targets DESTINATION lib) - - install(EXPORT cudaq-em-mps-targets - FILE CUDAQEmMpsTargets.cmake - NAMESPACE cudaq:: - DESTINATION lib/cmake/cudaq) else() message(WARNING "Skipped tensornet backend due to incompatible cutensornet version. Please install cutensornet v2.3.0+.") endif() diff --git a/runtime/nvqir/cutensornet/MpsExecutionManager.cpp b/runtime/nvqir/cutensornet/MpsExecutionManager.cpp deleted file mode 100644 index 8245a0f131..0000000000 --- a/runtime/nvqir/cutensornet/MpsExecutionManager.cpp +++ /dev/null @@ -1,467 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#include "common/Logger.h" -#include "cudaq/qis/managers/BasicExecutionManager.h" -#include "nvqir/CircuitSimulator.h" -#include "llvm/ADT/StringSwitch.h" - -namespace nvqir { -CircuitSimulator *getCircuitSimulatorInternal(); -} -namespace { -class MpsExecutionManager : public cudaq::BasicExecutionManager { -private: - nvqir::CircuitSimulator *simulator() { - return nvqir::getCircuitSimulatorInternal(); - } - - /// @brief To improve `qudit` allocation, we defer - /// single `qudit` allocation requests until the first - /// encountered `apply` call. 
- std::vector requestedAllocations; - std::vector auxQuditIdsForDeletion; - - std::vector - decomposeMultiControlledInstruction(const Instruction &instruction, - std::vector &aux) { - // Get the data, create the Qubit* targets - auto [gateName, parameters, controls, targets, op] = instruction; - if (controls.size() + targets.size() <= 2) { - return {instruction}; - } - std::vector decomposedInsts; - - const auto makeInstruction = - [](const std::string gateName, const std::vector &gateParams, - const std::vector &ctrls, - const std::vector &targets) -> Instruction { - return std::make_tuple(gateName, gateParams, ctrls, targets, cudaq::spin_op{}); - }; - - if (targets.size() > 1) { - if (gateName == "swap") { - { - auto mutableCtrls = controls; - mutableCtrls.emplace_back(targets[0]); - const auto insts = decomposeMultiControlledInstruction( - makeInstruction("x", {}, mutableCtrls, {targets[1]}), - auxQuditIdsForDeletion); - decomposedInsts.insert(decomposedInsts.end(), insts.begin(), - insts.end()); - } - { - auto mutableCtrls = controls; - mutableCtrls.emplace_back(targets[1]); - const auto insts = decomposeMultiControlledInstruction( - makeInstruction("x", {}, mutableCtrls, {targets[0]}), - auxQuditIdsForDeletion); - decomposedInsts.insert(decomposedInsts.end(), insts.begin(), - insts.end()); - } - { - auto mutableCtrls = controls; - mutableCtrls.emplace_back(targets[0]); - const auto insts = decomposeMultiControlledInstruction( - makeInstruction("x", {}, mutableCtrls, {targets[1]}), - auxQuditIdsForDeletion); - decomposedInsts.insert(decomposedInsts.end(), insts.begin(), - insts.end()); - } - return decomposedInsts; - } else if (gateName == "exp_pauli") { - if (controls.size() <= 1) { - return {instruction}; - } else { - std::vector qubitSupport; - std::vector> basisChange; - op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) { - if (type != cudaq::pauli::I) - qubitSupport.push_back(targets[qubitIdx]); - - if (type == cudaq::pauli::Y) - 
basisChange.emplace_back([&, qubitIdx](bool reverse) { - decomposedInsts.emplace_back( - makeInstruction("rx", {!reverse ? M_PI_2 : -M_PI_2}, {}, - {targets[qubitIdx]})); - }); - else if (type == cudaq::pauli::X) - basisChange.emplace_back([&, qubitIdx](bool) { - decomposedInsts.emplace_back( - makeInstruction("h", {}, {}, {targets[qubitIdx]})); - }); - }); - - if (!basisChange.empty()) - for (auto &basis : basisChange) - basis(false); - - std::vector> toReverse; - for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) { - decomposedInsts.emplace_back(makeInstruction( - "x", {}, {qubitSupport[i]}, {qubitSupport[i + 1]})); - toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]); - } - - // Since this is a compute-action-uncompute type circuit, we only need - // to apply control on this rz gate. - { - const auto mcRzInsts = decomposeMultiControlledInstruction( - makeInstruction("rz", {-2.0 * parameters[0]}, controls, - {qubitSupport.back()}), - auxQuditIdsForDeletion); - decomposedInsts.insert(decomposedInsts.end(), mcRzInsts.begin(), - mcRzInsts.end()); - } - - std::reverse(toReverse.begin(), toReverse.end()); - for (auto &[i, j] : toReverse) - decomposedInsts.emplace_back(makeInstruction("x", {}, {i}, {j})); - - if (!basisChange.empty()) { - std::reverse(basisChange.begin(), basisChange.end()); - for (auto &basis : basisChange) - basis(true); - } - return decomposedInsts; - } - } else { - throw std::runtime_error("Unsupported: " + gateName); - } - } - - const auto ccnot = [&](cudaq::QuditInfo &a, cudaq::QuditInfo &b, - cudaq::QuditInfo &c) { - decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c})); - decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c})); - decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c})); - decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c})); - decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c})); - decomposedInsts.emplace_back(makeInstruction("x", {}, {b}, {c})); - 
decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {c})); - decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {c})); - decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {b})); - decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {c})); - decomposedInsts.emplace_back(makeInstruction("h", {}, {}, {c})); - decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b})); - decomposedInsts.emplace_back(makeInstruction("t", {}, {}, {a})); - decomposedInsts.emplace_back(makeInstruction("tdg", {}, {}, {b})); - decomposedInsts.emplace_back(makeInstruction("x", {}, {a}, {b})); - }; - - const auto collectControls = [&](std::vector &ctls, - std::vector &aux, - int adjustment) { - for (int i = 0; i < static_cast(ctls.size()) - 1; i += 2) { - ccnot(ctls[i], ctls[i + 1], aux[i / 2]); - } - for (int i = 0; i < static_cast(ctls.size()) / 2 - 1 - adjustment; - ++i) { - ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); - } - }; - const auto adjustForSingleControl = - [&](std::vector &ctls, - std::vector &aux) { - if (ctls.size() % 2 != 0) - ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3], - aux[ctls.size() - 2]); - }; - for (std::size_t i = aux.size(); i < controls.size() - 1; ++i) - aux.emplace_back(cudaq::QuditInfo(2, getAvailableIndex(2))); - - collectControls(controls, aux, 0); - adjustForSingleControl(controls, aux); - // Add to the instruction queue - decomposedInsts.emplace_back( - std::move(gateName), parameters, - std::vector{aux[controls.size() - 2]}, targets, op); - adjustForSingleControl(controls, aux); - collectControls(controls, aux, 0); - return decomposedInsts; - } - - /// @brief Allocate all requested `qudits`. 
- void flushRequestedAllocations() { - if (requestedAllocations.empty()) - return; - - allocateQudits(requestedAllocations); - requestedAllocations.clear(); - } - -protected: - void allocateQudit(const cudaq::QuditInfo &q) override { - requestedAllocations.emplace_back(2, q.id); - } - - void allocateQudits(const std::vector &qudits) override { - simulator()->allocateQubits(qudits.size()); - } - - void deallocateQudit(const cudaq::QuditInfo &q) override { - - // Before trying to deallocate, make sure the qudit hasn't - // been requested but not allocated. - auto iter = - std::find(requestedAllocations.begin(), requestedAllocations.end(), q); - if (iter != requestedAllocations.end()) { - requestedAllocations.erase(iter); - return; - } - - simulator()->deallocate(q.id); - } - - void deallocateQudits(const std::vector &qudits) override { - std::vector local; - for (auto &q : qudits) { - auto iter = std::find(requestedAllocations.begin(), - requestedAllocations.end(), q); - if (iter != requestedAllocations.end()) { - requestedAllocations.erase(iter); - } else { - local.push_back(q.id); - } - } - - simulator()->deallocateQubits(local); - } - - void handleExecutionContextChanged() override { - requestedAllocations.clear(); - simulator()->setExecutionContext(executionContext); - } - - void handleExecutionContextEnded() override { - simulator()->resetExecutionContext(); - } - - void executeInstruction(const Instruction &instruction) override { - flushRequestedAllocations(); - - // Get the data, create the Qubit* targets - auto [gateName, parameters, controls, targets, op] = instruction; - - // Map the Qudits to Qubits - std::vector localT; - std::transform(targets.begin(), targets.end(), std::back_inserter(localT), - [](auto &&el) { return el.id; }); - std::vector localC; - std::transform(controls.begin(), controls.end(), std::back_inserter(localC), - [](auto &&el) { return el.id; }); - - // Apply the gate - llvm::StringSwitch>(gateName) - .Case("h", [&]() { 
simulator()->h(localC, localT[0]); }) - .Case("x", [&]() { simulator()->x(localC, localT[0]); }) - .Case("y", [&]() { simulator()->y(localC, localT[0]); }) - .Case("z", [&]() { simulator()->z(localC, localT[0]); }) - .Case("rx", - [&]() { simulator()->rx(parameters[0], localC, localT[0]); }) - .Case("ry", - [&]() { simulator()->ry(parameters[0], localC, localT[0]); }) - .Case("rz", - [&]() { simulator()->rz(parameters[0], localC, localT[0]); }) - .Case("s", [&]() { simulator()->s(localC, localT[0]); }) - .Case("t", [&]() { simulator()->t(localC, localT[0]); }) - .Case("sdg", [&]() { simulator()->sdg(localC, localT[0]); }) - .Case("tdg", [&]() { simulator()->tdg(localC, localT[0]); }) - .Case("r1", - [&]() { simulator()->r1(parameters[0], localC, localT[0]); }) - .Case("u1", - [&]() { simulator()->u1(parameters[0], localC, localT[0]); }) - .Case("u3", - [&]() { - simulator()->u3(parameters[0], parameters[1], parameters[2], - localC, localT[0]); - }) - .Case("swap", - [&]() { simulator()->swap(localC, localT[0], localT[1]); }) - .Case("exp_pauli", - [&]() { - simulator()->applyExpPauli(parameters[0], localC, localT, op); - }) - .Default([&]() { - throw std::runtime_error("[DefaultExecutionManager] invalid gate " - "application requested " + - gateName + "."); - })(); - } - - int measureQudit(const cudaq::QuditInfo &q, - const std::string ®isterName) override { - flushRequestedAllocations(); - return simulator()->mz(q.id, registerName); - } - - void measureSpinOp(const cudaq::spin_op &op) override { - flushRequestedAllocations(); - simulator()->flushGateQueue(); - - if (executionContext->canHandleObserve) { - auto result = simulator()->observe(*executionContext->spin.value()); - executionContext->expectationValue = result.expectationValue; - executionContext->result = cudaq::sample_result(result); - return; - } - - assert(op.num_terms() == 1 && "Number of terms is not 1."); - - cudaq::info("Measure {}", op.to_string(false)); - std::vector qubitsToMeasure; - 
std::vector> basisChange; - op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) { - if (type != cudaq::pauli::I) - qubitsToMeasure.push_back(qubitIdx); - - if (type == cudaq::pauli::Y) - basisChange.emplace_back([&, qubitIdx](bool reverse) { - simulator()->rx(!reverse ? M_PI_2 : -M_PI_2, qubitIdx); - }); - else if (type == cudaq::pauli::X) - basisChange.emplace_back( - [&, qubitIdx](bool) { simulator()->h(qubitIdx); }); - }); - - // Change basis, flush the queue - if (!basisChange.empty()) { - for (auto &basis : basisChange) - basis(false); - - simulator()->flushGateQueue(); - } - - // Get whether this is shots-based - int shots = 0; - if (executionContext->shots > 0) - shots = executionContext->shots; - - // Sample and give the data to the context - cudaq::ExecutionResult result = simulator()->sample(qubitsToMeasure, shots); - executionContext->expectationValue = result.expectationValue; - executionContext->result = cudaq::sample_result(result); - - // Restore the state. - if (!basisChange.empty()) { - std::reverse(basisChange.begin(), basisChange.end()); - for (auto &basis : basisChange) - basis(true); - - simulator()->flushGateQueue(); - } - } - -public: - MpsExecutionManager() { - cudaq::info("[MpsExecutionManager] Creating the {} backend.", - simulator()->name()); - } - virtual ~MpsExecutionManager() = default; - - void resetQudit(const cudaq::QuditInfo &q) override { - flushRequestedAllocations(); - simulator()->resetQubit(q.id); - } - - void endAdjointRegion() override { - assert(!adjointQueueStack.empty() && "There must be at least one queue"); - - auto adjointQueue = std::move(adjointQueueStack.back()); - adjointQueueStack.pop_back(); - - // Select the queue to which these instructions will be added. - InstructionQueue *queue = adjointQueueStack.empty() - ? 
&instructionQueue - : &(adjointQueueStack.back()); - - std::reverse(adjointQueue.begin(), adjointQueue.end()); - for (auto &instruction : adjointQueue) { - const auto insts = decomposeMultiControlledInstruction( - instruction, auxQuditIdsForDeletion); - queue->insert(queue->end(), insts.begin(), insts.end()); - } - } - - /// The goal for apply is to create a new element of the - /// instruction queue (a tuple). - void apply(const std::string_view gateName, const std::vector ¶ms, - const std::vector &controls, - const std::vector &targets, - bool isAdjoint, cudaq::spin_op op) override { - - // Make a copy of the name that we can mutate if necessary - std::string mutable_name(gateName); - - // Make a copy of the parameters that we can mutate - std::vector mutable_params = params; - - // Create an array of controls, we will - // prepend any extra controls if in a control region - std::vector mutable_controls; - for (auto &e : extraControlIds) - mutable_controls.emplace_back(2, e); - - for (auto &e : controls) - mutable_controls.push_back(e); - - std::vector mutable_targets; - for (auto &t : targets) - mutable_targets.push_back(t); - // We need to check if we need take the adjoint of the operation. To do this - // we use a logical XOR between `isAdjoint` and whether the size of - // `adjointQueueStack` is even. The size of `adjointQueueStack` corresponds - // to the number of nested `cudaq::adjoint` calls. If the size is even, then - // we need to change the operation when `isAdjoint` is true. If the size is - // odd, then we need to change the operation when `isAdjoint` is false. - // (Adjoint modifiers cancel each other, e.g, `adj adj r1` is `r1`.) 
- // - // The cases: - // * not-adjoint, even number of `cudaq::adjoint` => _no_ need to change op - // * not-adjoint, odd number of `cudaq::adjoint` => change op - // * adjoint, even number of `cudaq::adjoint` => change op - // * adjoint, odd number `cudaq::adjoint` => _no_ need to change op - // - bool evenAdjointStack = (adjointQueueStack.size() % 2) == 0; - if (isAdjoint != !evenAdjointStack) { - for (std::size_t i = 0; i < params.size(); i++) - mutable_params[i] = -1.0 * params[i]; - if (gateName == "t") - mutable_name = "tdg"; - else if (gateName == "s") - mutable_name = "sdg"; - } - - if (!adjointQueueStack.empty()) { - // Add to the adjoint instruction queue - adjointQueueStack.back().emplace_back( - mutable_name, mutable_params, mutable_controls, mutable_targets, op); - return; - } - - const auto insts = decomposeMultiControlledInstruction( - {std::move(mutable_name), mutable_params, mutable_controls, - mutable_targets, op}, - auxQuditIdsForDeletion); - instructionQueue.insert(instructionQueue.end(), insts.begin(), insts.end()); - } - - void resetExecutionContext() override { - BasicExecutionManager::resetExecutionContext(); - - deallocateQudits(auxQuditIdsForDeletion); - for (auto &q : auxQuditIdsForDeletion) { - returnIndex(q.id); - } - auxQuditIdsForDeletion.clear(); - } -}; - -} // namespace - -CUDAQ_REGISTER_EXECUTION_MANAGER(MpsExecutionManager) \ No newline at end of file diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp index b849b7a119..ad53cdfb3c 100644 --- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp @@ -18,7 +18,7 @@ class SimulatorMPS : public SimulatorTensorNetBase { // Default relative cutoff double m_relCutoff = 1e-5; std::vector m_mpsTensors_d; - + std::vector m_auxQubitsForGateDecomp; public: SimulatorMPS() : SimulatorTensorNetBase() { if (auto *maxBondEnvVar = std::getenv("CUDAQ_MPS_MAX_BOND")) { @@ 
-108,6 +108,223 @@ class SimulatorMPS : public SimulatorTensorNetBase { } m_mpsTensors_d.clear(); } + + void resetExecutionContext() override { + SimulatorTensorNetBase::resetExecutionContext(); + m_auxQubitsForGateDecomp.clear(); + } + + /// @brief Return the state vector data + cudaq::State getStateData() override { + LOG_API_TIME(); + if (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size() > 64) + throw std::runtime_error("State vector data is too large."); + // Handle empty state (e.g., no qubit allocation) + if (!m_state) + return cudaq::State{{0}, {}}; + const uint64_t svDim = + (1ull << (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size())); + const std::vector projectedModes(m_auxQubitsForGateDecomp.begin(), + m_auxQubitsForGateDecomp.end()); + return cudaq::State{{svDim}, m_state->getStateVector(projectedModes)}; + } + + size_t addAuxQubit() { + if (m_state->isDirty()) + throw std::runtime_error( + "[MPS Simulator] Unable to perform multi-control gate decomposition " + "due to dynamical circuits."); + m_state = std::make_unique(m_state->getNumQubits() + 1, + m_cutnHandle); + return m_state->getNumQubits() - 1; + } + + template + void + decomposeMultiControlledInstruction(const std::vector ¶ms, + const std::vector &controls, + const std::vector &targets) { + if (controls.size() <= 1) { + enqueueQuantumOperation(params, controls, targets); + return; + } + + const auto ccnot = [&](std::size_t a, std::size_t b, std::size_t c) { + enqueueQuantumOperation>({}, {}, {c}); + enqueueQuantumOperation>({}, {b}, {c}); + enqueueQuantumOperation>({}, {}, {c}); + enqueueQuantumOperation>({}, {a}, {c}); + enqueueQuantumOperation>({}, {}, {c}); + enqueueQuantumOperation>({}, {b}, {c}); + enqueueQuantumOperation>({}, {}, {c}); + enqueueQuantumOperation>({}, {a}, {c}); + enqueueQuantumOperation>({}, {}, {b}); + enqueueQuantumOperation>({}, {}, {c}); + enqueueQuantumOperation>({}, {}, {c}); + enqueueQuantumOperation>({}, {a}, {b}); + enqueueQuantumOperation>({}, 
{}, {a}); + enqueueQuantumOperation>({}, {}, {b}); + enqueueQuantumOperation>({}, {a}, {b}); + }; + + const auto collectControls = [&](const std::vector &ctls, + const std::vector &aux, + int adjustment) { + for (int i = 0; i < static_cast(ctls.size()) - 1; i += 2) { + ccnot(ctls[i], ctls[i + 1], aux[i / 2]); + } + for (int i = 0; i < static_cast(ctls.size()) / 2 - 1 - adjustment; + ++i) { + ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); + } + }; + const auto adjustForSingleControl = + [&](const std::vector &ctls, + const std::vector &aux) { + if (ctls.size() % 2 != 0) + ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3], + aux[ctls.size() - 2]); + }; + + std::vector aux; + for (std::size_t i = 0; i < controls.size() - 1; ++i) { + const auto auxQubit = addAuxQubit(); + m_auxQubitsForGateDecomp.emplace_back(auxQubit); + aux.emplace_back(auxQubit); + } + + collectControls(controls, aux, 0); + adjustForSingleControl(controls, aux); + + // Add to the singly-controlled instruction queue + enqueueQuantumOperation(params, {aux[controls.size() - 2]}, targets); + + adjustForSingleControl(controls, aux); + collectControls(controls, aux, 0); + } + +#define CIRCUIT_SIMULATOR_ONE_QUBIT(NAME) \ + using CircuitSimulator::NAME; \ + void NAME(const std::vector &controls, \ + const std::size_t qubitIdx) override { \ + decomposeMultiControlledInstruction>( \ + {}, controls, std::vector{qubitIdx}); \ + } + +#define CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(NAME) \ + using CircuitSimulator::NAME; \ + void NAME(const double angle, const std::vector &controls, \ + const std::size_t qubitIdx) override { \ + decomposeMultiControlledInstruction>( \ + {angle}, controls, std::vector{qubitIdx}); \ + } + + /// @brief The X gate + CIRCUIT_SIMULATOR_ONE_QUBIT(x) + /// @brief The Y gate + CIRCUIT_SIMULATOR_ONE_QUBIT(y) + /// @brief The Z gate + CIRCUIT_SIMULATOR_ONE_QUBIT(z) + /// @brief The H gate + CIRCUIT_SIMULATOR_ONE_QUBIT(h) + /// @brief The S gate + CIRCUIT_SIMULATOR_ONE_QUBIT(s) 
+ /// @brief The T gate + CIRCUIT_SIMULATOR_ONE_QUBIT(t) + /// @brief The Sdg gate + CIRCUIT_SIMULATOR_ONE_QUBIT(sdg) + /// @brief The Tdg gate + CIRCUIT_SIMULATOR_ONE_QUBIT(tdg) + /// @brief The RX gate + CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rx) + /// @brief The RY gate + CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(ry) + /// @brief The RZ gate + CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rz) + /// @brief The Phase gate + CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(r1) +// Undef those preprocessor defines. +#undef CIRCUIT_SIMULATOR_ONE_QUBIT +#undef CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM + + using CircuitSimulator::swap; + void swap(const std::vector &ctrlBits, const std::size_t srcIdx, + const std::size_t tgtIdx) override { + if (ctrlBits.empty()) + return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx); + { + std::vector ctls = ctrlBits; + ctls.emplace_back(tgtIdx); + decomposeMultiControlledInstruction>({}, ctls, {srcIdx}); + } + { + std::vector ctls = ctrlBits; + ctls.emplace_back(srcIdx); + decomposeMultiControlledInstruction>({}, ctls, {tgtIdx}); + } + { + std::vector ctls = ctrlBits; + ctls.emplace_back(tgtIdx); + decomposeMultiControlledInstruction>({}, ctls, {srcIdx}); + } + } + + void applyExpPauli(double theta, + const std::vector &controls, + const std::vector &qubitIds, + const cudaq::spin_op &op) override { + if (op.is_identity()) { + if (controls.empty()) { + // exp(i*theta*Id) is noop if this is not a controlled gate. + return; + } else { + // Throw an error if this exp_pauli(i*theta*Id) becomes a non-trivial + // gate due to control qubits. + // FIXME: revisit this once + // https://github.com/NVIDIA/cuda-quantum/issues/483 is implemented. 
+ throw std::logic_error("Applying controlled global phase via exp_pauli " + "of identity operator is not supported"); + } + } + std::vector qubitSupport; + std::vector> basisChange; + op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) { + if (type != cudaq::pauli::I) + qubitSupport.push_back(qubitIds[qubitIdx]); + + if (type == cudaq::pauli::Y) + basisChange.emplace_back([&, qubitIdx](bool reverse) { + rx(!reverse ? M_PI_2 : -M_PI_2, qubitIds[qubitIdx]); + }); + else if (type == cudaq::pauli::X) + basisChange.emplace_back( + [&, qubitIdx](bool) { h(qubitIds[qubitIdx]); }); + }); + + if (!basisChange.empty()) + for (auto &basis : basisChange) + basis(false); + + std::vector> toReverse; + for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) { + x({qubitSupport[i]}, qubitSupport[i + 1]); + toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]); + } + + // Perform multi-control decomposition. + decomposeMultiControlledInstruction>( + {-2.0 * theta}, controls, {qubitSupport.back()}); + + std::reverse(toReverse.begin(), toReverse.end()); + for (auto &[i, j] : toReverse) + x({i}, j); + + if (!basisChange.empty()) { + std::reverse(basisChange.begin(), basisChange.end()); + for (auto &basis : basisChange) + basis(true); + } + } }; } // end namespace nvqir diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config index 040ae0d8bf..27ac2279b2 100644 --- a/runtime/nvqir/cutensornet/tensornet-mps.config +++ b/runtime/nvqir/cutensornet/tensornet-mps.config @@ -9,5 +9,3 @@ NVQIR_SIMULATION_BACKEND="tensornet-mps" TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target" GPU_REQUIREMENTS="true" -COMPILER_FLAGS="$COMPILER_FLAGS -DCUDAQ_ENABLE_MULTI_CONTROL_DECOMPOSITION" -LIBRARY_MODE_EXECUTION_MANAGER="mps" \ No newline at end of file diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp index d869801a46..ecf455e819 100644 --- 
a/runtime/nvqir/cutensornet/tensornet_state.cpp +++ b/runtime/nvqir/cutensornet/tensornet_state.cpp @@ -23,19 +23,17 @@ TensorNetState::TensorNetState(std::size_t numQubits, void TensorNetState::applyGate(const std::vector &qubitIds, void *gateDeviceMem, bool adjoint) { - int64_t id = 0; HANDLE_CUTN_ERROR(cutensornetStateApplyTensor( m_cutnHandle, m_quantumState, qubitIds.size(), qubitIds.data(), gateDeviceMem, nullptr, /*immutable*/ 1, - /*adjoint*/ static_cast(adjoint), /*unitary*/ 1, &id)); + /*adjoint*/ static_cast(adjoint), /*unitary*/ 1, &m_tensorId)); } void TensorNetState::applyQubitProjector(void *proj_d, int32_t qubitIdx) { - int64_t id = 0; HANDLE_CUTN_ERROR( cutensornetStateApplyTensor(m_cutnHandle, m_quantumState, 1, &qubitIdx, proj_d, nullptr, /*immutable*/ 1, - /*adjoint*/ 0, /*unitary*/ 0, &id)); + /*adjoint*/ 0, /*unitary*/ 0, &m_tensorId)); } std::unordered_map @@ -120,24 +118,26 @@ TensorNetState::sample(const std::vector &measuredBitIds, return counts; } -std::vector> TensorNetState::getStateVector() { +std::vector> +TensorNetState::getStateVector(const std::vector &projectedModes) { // Make sure that we don't overflow the memory size calculation. // Note: the actual limitation will depend on the system memory. 
- if (m_numQubits > 64 || - (1ull << m_numQubits) > + if ((m_numQubits - projectedModes.size()) > 64 || + (1ull << (m_numQubits - projectedModes.size())) > std::numeric_limits::max() / sizeof(std::complex)) throw std::runtime_error( "Too many qubits are requested for full state vector contraction."); LOG_API_TIME(); void *d_sv{nullptr}; - const uint64_t svDim = 1ull << m_numQubits; + const uint64_t svDim = 1ull << (m_numQubits - projectedModes.size()); HANDLE_CUDA_ERROR(cudaMalloc(&d_sv, svDim * sizeof(std::complex))); ScratchDeviceMem scratchPad; // Create the quantum state amplitudes accessor cutensornetStateAccessor_t accessor; - HANDLE_CUTN_ERROR(cutensornetCreateAccessor(m_cutnHandle, m_quantumState, 0, - nullptr, nullptr, &accessor)); + HANDLE_CUTN_ERROR(cutensornetCreateAccessor( + m_cutnHandle, m_quantumState, projectedModes.size(), + projectedModes.data(), nullptr, &accessor)); const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network @@ -167,9 +167,11 @@ std::vector> TensorNetState::getStateVector() { // Compute the quantum state amplitudes std::complex stateNorm{0.0, 0.0}; - HANDLE_CUTN_ERROR( - cutensornetAccessorCompute(m_cutnHandle, accessor, nullptr, workDesc, - d_sv, static_cast(&stateNorm), 0)); + // All projected modes are assumed to be projected to 0. 
+ std::vector projectedModeValues(projectedModes.size(), 0); + HANDLE_CUTN_ERROR(cutensornetAccessorCompute( + m_cutnHandle, accessor, projectedModeValues.data(), workDesc, d_sv, + static_cast(&stateNorm), 0)); std::vector> h_sv(svDim); HANDLE_CUDA_ERROR(cudaMemcpy(h_sv.data(), d_sv, svDim * sizeof(std::complex), diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h index bd38659d1c..7416b3807b 100644 --- a/runtime/nvqir/cutensornet/tensornet_state.h +++ b/runtime/nvqir/cutensornet/tensornet_state.h @@ -19,6 +19,7 @@ class TensorNetState { std::size_t m_numQubits; cutensornetHandle_t m_cutnHandle; cutensornetState_t m_quantumState; + int64_t m_tensorId = -1; public: /// @brief Constructor @@ -45,7 +46,8 @@ class TensorNetState { /// @brief Contract the tensor network representation to retrieve the state /// vector. - std::vector> getStateVector(); + std::vector> + getStateVector(const std::vector &projectedModes = {}); /// @brief Compute the reduce density matrix on a set of qubits /// @@ -73,7 +75,10 @@ class TensorNetState { /// @brief Number of qubits that this state represents. 
std::size_t getNumQubits() const { return m_numQubits; } - + + /// @brief True if the state contains gate tensors (not just initial qubit + /// tensors) + bool isDirty() const { return m_tensorId > 0; } /// @brief Destructor ~TensorNetState(); }; diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp index 7d27635ef8..abdac9b870 100644 --- a/unittests/integration/builder_tester.cpp +++ b/unittests/integration/builder_tester.cpp @@ -125,7 +125,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) { EXPECT_EQ(counter, 1000); } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS // MPS doesn't support gates on more than 2 qubits { auto ccnot_builder = cudaq::make_kernel(); @@ -139,7 +138,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) { counts.dump(); EXPECT_TRUE(counts.begin()->first == "101"); } -#endif { // Check controlled parametric gates (constant angle) @@ -221,8 +219,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) { } } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits CUDAQ_TEST(BuilderTester, checkRotations) { // rx: entire qvector @@ -427,11 +423,7 @@ CUDAQ_TEST(BuilderTester, checkRotations) { EXPECT_EQ(counts.count("0111"), 1000); } } -#endif -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// Skip, else fails with error - '"MPS simulator: Gates on 3 or more qubits are -// unsupported. Encountered: swap[0][1,2]" thrown in the test body.' CUDAQ_TEST(BuilderTester, checkSwap) { cudaq::set_random_seed(13); @@ -575,7 +567,6 @@ CUDAQ_TEST(BuilderTester, checkSwap) { EXPECT_NEAR(counts.count(want_state), 1000, 0); } } -#endif // Conditional execution on the tensornet backend is slow for a large number of // shots. 
@@ -708,8 +699,6 @@ CUDAQ_TEST(BuilderTester, checkIsArgStdVec) { EXPECT_FALSE(kernel.isArgStdVec(1)); } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits CUDAQ_TEST(BuilderTester, checkKernelControl) { cudaq::set_random_seed(13); @@ -766,7 +755,6 @@ CUDAQ_TEST(BuilderTester, checkKernelControl) { EXPECT_EQ(1, counts.size()); EXPECT_TRUE(counts.begin()->first == "101"); } -#endif CUDAQ_TEST(BuilderTester, checkAdjointOp) { auto kernel = cudaq::make_kernel(); @@ -1101,8 +1089,6 @@ CUDAQ_TEST(BuilderTester, checkExpPauli) { } } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits CUDAQ_TEST(BuilderTester, checkControlledRotations) { // rx: pi { @@ -1217,7 +1203,6 @@ CUDAQ_TEST(BuilderTester, checkControlledRotations) { EXPECT_EQ(counts.count("11111111"), 1000); } } -#endif #ifndef CUDAQ_BACKEND_DM From a9a4fcf012bb440368420368e4e899f6cb66dc3d Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 00:44:14 +0000 Subject: [PATCH 08/18] Remove the remaining bypassed tests --- unittests/CMakeLists.txt | 2 +- unittests/qir/NVQIRTester.cpp | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index b9a01b26d1..50b976f337 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -80,7 +80,7 @@ macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER) set(TEST_LABELS "gpu_required") endif() if (${NVQIR_BACKEND} STREQUAL "tensornet-mps") - target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_BACKEND_TENSORNET_MPS) + target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET) set(TEST_LABELS "gpu_required") endif() if (${NVQIR_BACKEND} STREQUAL "custatevec-fp32") diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp index f848786f2e..f287877573 100644 --- a/unittests/qir/NVQIRTester.cpp +++ b/unittests/qir/NVQIRTester.cpp @@ 
-113,8 +113,6 @@ CUDAQ_TEST(NVQIRTester, checkSimple) { __quantum__rt__finalize(); } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits (controlled swap) CUDAQ_TEST(NVQIRTester, checkQuantumIntrinsics) { __quantum__rt__initialize(0, nullptr); auto qubits = __quantum__rt__qubit_allocate_array(3); @@ -153,7 +151,6 @@ CUDAQ_TEST(NVQIRTester, checkQuantumIntrinsics) { __quantum__rt__qubit_release_array(qubits); __quantum__rt__finalize(); } -#endif CUDAQ_TEST(NVQIRTester, checkReset) { __quantum__rt__initialize(0, nullptr); @@ -175,8 +172,6 @@ CUDAQ_TEST(NVQIRTester, checkReset) { __quantum__rt__finalize(); } -#ifndef CUDAQ_BACKEND_TENSORNET_MPS -// MPS doesn't support gates on more than 2 qubits (controlled swap) // SWAP with a single ctrl qubit in 0 state. CUDAQ_TEST(NVQIRTester, checkSWAP) { // Simple SWAP. @@ -261,7 +256,6 @@ CUDAQ_TEST(NVQIRTester, checkSWAP) { __quantum__rt__finalize(); } } -#endif CUDAQ_TEST(NVQIRTester, checkQubitReset) { // Initialize two qubits in the 0-state. 
From ea7b38901177bf2cada5ea3470e5f6fa08f861aa Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 00:49:45 +0000 Subject: [PATCH 09/18] Clean up debug changes --- multi_ctrls.cpp | 96 -------- .../kernels/decomposition/controlled_gates.h | 218 ------------------ runtime/cudaq/utils/cudaq_utils.h | 7 - runtime/nvqir/cutensornet/CMakeLists.txt | 4 +- .../nvqir/cutensornet/tensornet-mps.config | 11 - runtime/nvqir/cutensornet/tensornet.config | 11 - unittests/integration/gate_library_tester.cpp | 1 - 7 files changed, 2 insertions(+), 346 deletions(-) delete mode 100644 multi_ctrls.cpp delete mode 100644 runtime/cudaq/kernels/decomposition/controlled_gates.h delete mode 100644 runtime/nvqir/cutensornet/tensornet-mps.config delete mode 100644 runtime/nvqir/cutensornet/tensornet.config diff --git a/multi_ctrls.cpp b/multi_ctrls.cpp deleted file mode 100644 index 3e83558ece..0000000000 --- a/multi_ctrls.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include -namespace cudaq { - -void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ { - h(c); - cx(b, c); - t(c); - cx(a, c); - t(c); - cx(b, c); - t(c); - cx(a, c); - t(b); - t(c); - h(c); - cx(a, b); - t(a); - t(b); - cx(a, b); -} - -void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux, - int adjustment) __qpu__ { - for (int i = 0; i < ctls.size() - 1; i += 2) { - CCNOT(ctls[i], ctls[i + 1], aux[i / 2]); - } - for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) { - CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); - } -} - -void CollectControls( - const std::vector> &ctls, - cudaq::qview<> aux, int adjustment) __qpu__ { - for (int i = 0; i < ctls.size() - 1; i += 2) { - CCNOT(ctls[i], ctls[i + 1], aux[i / 2]); - } - for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) { - CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); - } -} - -void AdjustForSingleControl(cudaq::qview<> ctls, cudaq::qview<> aux) __qpu__ { - if (ctls.size() % 2 != 0) - CCNOT(ctls[ctls.size() - 1], 
aux[ctls.size() - 3], aux[ctls.size() - 2]); -} - -template -decltype(auto) getParameterPackVals(T &&...Args) noexcept { - return std::get(std::forward_as_tuple(std::forward(Args)...)); -} - -template -void x(cudaq::qubit& c0, cudaq::qubit& c1, QubitTy &...qubits) __qpu__ { - static_assert(std::is_same_v); - static constexpr std::size_t qubitCount = sizeof...(qubits) + 2; - static constexpr std::size_t numCtrls = qubitCount - 1; - static_assert(numCtrls > 1); - if constexpr (numCtrls == 2) { - CCNOT(c0, - c1, - getParameterPackVals<0>(qubits...)); - } else { - cudaq::qvector aux(numCtrls - 2); - std::vector> ctls{{qubits...}}; - ctls.pop_back(); - ctls.emplace_back(c1); - ctls.emplace_back(c0); - assert(ctls.size() == numCtrls); - cudaq::compute_action( - [&]() { CollectControls(ctls, aux, 1 - (ctls.size() % 2)); }, - [&]() { - if (ctls.size() % 2 != 0) { - CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], getParameterPackVals(qubits...)); - } else { - CCNOT(aux[ctls.size() - 3], aux[ctls.size() - 4], getParameterPackVals(qubits...)); - } - }); - } -} -} // namespace cudaq - -int main() { - - auto kernel = []() __qpu__ { - cudaq::qarray<5> q; - x(q); - x(q[0], q[1], q[2], q[3], q[4]); - mz(q); - }; - - auto counts = cudaq::sample(kernel); - counts.dump(); - - return 0; -} \ No newline at end of file diff --git a/runtime/cudaq/kernels/decomposition/controlled_gates.h b/runtime/cudaq/kernels/decomposition/controlled_gates.h deleted file mode 100644 index 254eb0822a..0000000000 --- a/runtime/cudaq/kernels/decomposition/controlled_gates.h +++ /dev/null @@ -1,218 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. 
* - ******************************************************************************/ - -#pragma once - -#include -#include - -namespace cudaq { -namespace internal { - -/// Applies the doubly controlled–NOT (CCNOT) gate to three qubits via -/// decomposition. -void CCNOT(qubit &a, qubit &b, qubit &c) __qpu__ { - h(c); - cx(b, c); - t(c); - cx(a, c); - t(c); - cx(b, c); - t(c); - cx(a, c); - t(b); - t(c); - h(c); - cx(a, b); - t(a); - t(b); - cx(a, b); -} - -/// Collects the given list of control qubits into one or two of the given -/// auxiliary qubits, using all but the last qubits in the auxiliary list as -/// scratch qubits. -/// For example, if the controls list is 6 qubits, the auxiliary list must be 5 -/// qubits, and the state from the 6 control qubits will be collected into the -/// last qubit of the auxiliary array. -/// The adjustment is used to allow the caller to reduce or increase -/// the number of times this is run based on the eventual number of control -/// qubits needed. -void CollectControls( - const std::vector> &ctls, - cudaq::qview<> aux, int adjustment) __qpu__ { - for (int i = 0; i < ctls.size() - 1; i += 2) { - CCNOT(ctls[i], ctls[i + 1], aux[i / 2]); - } - for (int i = 0; i < ctls.size() / 2 - 1 - adjustment; ++i) { - CCNOT(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); - } -} - -void CollectControls(cudaq::qview<> ctls, cudaq::qview<> aux, - int adjustment) __qpu__ { - std::vector> castedCtls; - for (auto &q : ctls) - castedCtls.emplace_back(q); - - CollectControls(castedCtls, aux, adjustment); -} - -/// When collecting controls, if there is an uneven number of original control -/// qubits then the last control and the second to last auxiliary will be -/// collected into the last auxiliary. 
-void AdjustForSingleControl( - const std::vector> &ctls, - cudaq::qview<> aux) __qpu__ { - if (ctls.size() % 2 != 0) - CCNOT(ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]); -} - -template -void x(const std::vector> &ctrls, - cudaq::qubit &target) __qpu__ { - static_assert(std::is_same_v); - const std::size_t numCtrls = ctrls.size(); - if (numCtrls == 0) { - x(target); - } else if (numCtrls == 1) { - cx(ctrls[0].get(), target); - } else if (numCtrls == 2) { - CCNOT(ctrls[0], ctrls[1], target); - } else { - cudaq::qvector aux(numCtrls - 2); - cudaq::compute_action( - [&]() { CollectControls(ctrls, aux, 1 - (ctrls.size() % 2)); }, - [&]() { - if (ctrls.size() % 2 != 0) { - CCNOT(ctrls[ctrls.size() - 1], aux[ctrls.size() - 3], target); - } else { - CCNOT(aux[ctrls.size() - 3], aux[ctrls.size() - 4], target); - } - }); - } -} - -#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(NAME) \ - template \ - void NAME(const std::vector> &ctrls, \ - cudaq::qubit &target) __qpu__ { \ - static_assert(std::is_same_v); \ - const std::size_t numCtrls = ctrls.size(); \ - if (numCtrls == 0) { \ - NAME(target); \ - } else if (numCtrls == 1) { \ - NAME(ctrls[0].get(), target); \ - } else { \ - cudaq::qvector aux(numCtrls - 1); \ - cudaq::compute_action( \ - [&]() { \ - CollectControls(ctrls, aux, 0); \ - AdjustForSingleControl(ctrls, aux); \ - }, \ - [&]() { NAME(aux[ctrls.size() - 2], target); }); \ - } \ - } - -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(h) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(y) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(z) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(t) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL_IMPL(s) - -#define CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(NAME) \ - template \ - void NAME(ScalarAngle angle, \ - const std::vector> &ctrls, \ - cudaq::qubit &target) __qpu__ { \ - static_assert(std::is_same_v); \ - const std::size_t numCtrls = ctrls.size(); \ - if (numCtrls == 0) { \ - NAME(angle, target); \ - } else if 
(numCtrls == 1) { \ - oneQubitSingleParameterApply( \ - angle, ctrls[0].get(), target); \ - } else { \ - cudaq::qvector aux(numCtrls - 1); \ - cudaq::compute_action( \ - [&]() { \ - CollectControls(ctrls, aux, 0); \ - AdjustForSingleControl(ctrls, aux); \ - }, \ - [&]() { \ - oneQubitSingleParameterApply( \ - angle, aux[ctrls.size() - 2], target); \ - }); \ - } \ - } - -CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rx) -CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(ry) -CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(rz) -CUDAQ_ONE_TARGET_QUBIT_ONE_PARAM_MULTI_CONTROL_IMPL(r1) -} // namespace internal - -#define CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(NAME) \ - template \ - void NAME(cudaq::qubit &c0, cudaq::qubit &c1, QubitTy &...qubits) __qpu__ { \ - static_assert(std::is_same_v); \ - std::vector> ctls{{qubits...}}; \ - /* Last qubit is the target */ \ - ctls.pop_back(); \ - /*Add the two explicit qubits */ \ - ctls.emplace_back(c1); \ - ctls.emplace_back(c0); \ - internal::NAME( \ - ctls, cudaq::getParameterPackVals(qubits...)); \ - } \ - template \ - void NAME(cudaq::qview<> ctrls, cudaq::qubit &target) __qpu__ { \ - static_assert(std::is_same_v); \ - std::vector> castedCtls; \ - for (auto &q : ctrls) \ - castedCtls.emplace_back(q); \ - internal::NAME(castedCtls, target); \ - } - -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(h) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(x) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(y) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(z) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(t) -CUDAQ_ONE_TARGET_QUBIT_MULTI_CONTROL(s) - -#define CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(NAME) \ - template \ - void NAME(ScalarAngle angle, cudaq::qubit &c0, cudaq::qubit &c1, \ - QubitTy &...qubits) __qpu__ { \ - static_assert(std::is_same_v); \ - static_assert(sizeof...(qubits) > 0); \ - std::vector> ctls{{qubits...}}; \ - /* Last qubit is the target */ \ - cudaq::qubit &target = ctls.back(); \ - ctls.pop_back(); \ - /*Add the two explicit qubits */ \ - 
ctls.emplace_back(c1); \ - ctls.emplace_back(c0); \ - internal::NAME(angle, ctls, target); \ - } \ - template \ - void NAME(ScalarAngle angle, cudaq::qview<> ctrls, cudaq::qubit &target) \ - __qpu__ { \ - static_assert(std::is_same_v); \ - std::vector> castedCtls; \ - for (auto &q : ctrls) \ - castedCtls.emplace_back(q); \ - internal::NAME(angle, castedCtls, target); \ - } - -CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rx) -CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(ry) -CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(rz) -CUDAQ_ONE_TARGET_ONE_PARAM_QUBIT_MULTI_CONTROL(r1) -} // namespace cudaq diff --git a/runtime/cudaq/utils/cudaq_utils.h b/runtime/cudaq/utils/cudaq_utils.h index 16d7d824d4..9e3280a232 100644 --- a/runtime/cudaq/utils/cudaq_utils.h +++ b/runtime/cudaq/utils/cudaq_utils.h @@ -188,13 +188,6 @@ void tuple_for_each_with_idx(TupleType &&t, FunctionType f) { std::integral_constant()); } -// Utility function to access a parameter at index from a variadic parameter -// pack. 
-template -decltype(auto) getParameterPackVals(T &&...Args) noexcept { - return std::get(std::forward_as_tuple(std::forward(Args)...)); -} - // Function check if file with given path+name exists inline bool fileExists(const std::string &name) { if (FILE *file = fopen(name.c_str(), "r")) { diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt index 22d8e30e8a..23f10564af 100644 --- a/runtime/nvqir/cutensornet/CMakeLists.txt +++ b/runtime/nvqir/cutensornet/CMakeLists.txt @@ -69,12 +69,12 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3") target_include_directories(nvqir-${LIBRARY_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/runtime/common ${CMAKE_SOURCE_DIR}/runtime/nvqir ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR}) target_link_libraries(nvqir-${LIBRARY_NAME} PRIVATE fmt::fmt-header-only cudaq cudaq-common ${CUTENSORNET_LIB} ${CUTENSOR_LIB} CUDA::cudart) install(TARGETS nvqir-${LIBRARY_NAME} DESTINATION lib) + file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\n") + install(FILES ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config DESTINATION targets) endmacro() nvqir_create_cutn_plugin(tensornet ${BASE_TENSOR_BACKEND_SRS} simulator_tensornet_register.cpp ) nvqir_create_cutn_plugin(tensornet-mps ${BASE_TENSOR_BACKEND_SRS} simulator_mps_register.cpp) - add_target_config(tensornet) - add_target_config(tensornet-mps) add_library(tensornet-mpi-util OBJECT mpi_support.cpp) target_include_directories(tensornet-mpi-util PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/runtime) target_link_libraries(tensornet-mpi-util PRIVATE cudaq-common fmt::fmt-header-only) diff --git a/runtime/nvqir/cutensornet/tensornet-mps.config b/runtime/nvqir/cutensornet/tensornet-mps.config deleted file mode 100644 index 27ac2279b2..0000000000 --- a/runtime/nvqir/cutensornet/tensornet-mps.config +++ /dev/null 
@@ -1,11 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -NVQIR_SIMULATION_BACKEND="tensornet-mps" -TARGET_DESCRIPTION="cuTensorNet-based Matrix Product State (MPS) backend target" -GPU_REQUIREMENTS="true" diff --git a/runtime/nvqir/cutensornet/tensornet.config b/runtime/nvqir/cutensornet/tensornet.config deleted file mode 100644 index 9f47ba1c88..0000000000 --- a/runtime/nvqir/cutensornet/tensornet.config +++ /dev/null @@ -1,11 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. 
# -# ============================================================================ # - -NVQIR_SIMULATION_BACKEND="tensornet" -TARGET_DESCRIPTION="cuTensorNet-based full tensor network contraction backend target" -GPU_REQUIREMENTS="true" diff --git a/unittests/integration/gate_library_tester.cpp b/unittests/integration/gate_library_tester.cpp index 3d976199f5..af531e936c 100644 --- a/unittests/integration/gate_library_tester.cpp +++ b/unittests/integration/gate_library_tester.cpp @@ -88,7 +88,6 @@ CUDAQ_TEST(GateLibraryTester, checkGivensRotationKernelBuilder) { } } - CUDAQ_TEST(GateLibraryTester, checkControlledGivensRotation) { for (const auto &angle : cudaq::linspace(-M_PI, M_PI, NUM_ANGLES)) { // Same check, with 2 control qubits From 31f2b06e14ea287f7012d4d9f996b5cb50a18193 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 00:52:24 +0000 Subject: [PATCH 10/18] Code format --- .../cutensornet/simulator_mps_register.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp index ad53cdfb3c..5605c5dbfa 100644 --- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp @@ -19,6 +19,7 @@ class SimulatorMPS : public SimulatorTensorNetBase { double m_relCutoff = 1e-5; std::vector m_mpsTensors_d; std::vector m_auxQubitsForGateDecomp; + public: SimulatorMPS() : SimulatorTensorNetBase() { if (auto *maxBondEnvVar = std::getenv("CUDAQ_MPS_MAX_BOND")) { @@ -195,10 +196,11 @@ class SimulatorMPS : public SimulatorTensorNetBase { collectControls(controls, aux, 0); adjustForSingleControl(controls, aux); - + // Add to the singly-controlled instruction queue - enqueueQuantumOperation(params, {aux[controls.size() - 2]}, targets); - + enqueueQuantumOperation( + params, {aux[controls.size() - 2]}, targets); + adjustForSingleControl(controls, aux); collectControls(controls, 
aux, 0); } @@ -268,11 +270,10 @@ class SimulatorMPS : public SimulatorTensorNetBase { decomposeMultiControlledInstruction>({}, ctls, {srcIdx}); } } - - void applyExpPauli(double theta, - const std::vector &controls, - const std::vector &qubitIds, - const cudaq::spin_op &op) override { + + void applyExpPauli(double theta, const std::vector &controls, + const std::vector &qubitIds, + const cudaq::spin_op &op) override { if (op.is_identity()) { if (controls.empty()) { // exp(i*theta*Id) is noop if this is not a controlled gate. From 2f61e494002fac666fff59ac6b2da54ea5abcbc2 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 01:44:07 +0000 Subject: [PATCH 11/18] Test time reduction --- unittests/integration/builder_tester.cpp | 17 ++++++++--------- unittests/integration/gradient_tester.cpp | 5 ++++- unittests/integration/nlopt_tester.cpp | 6 +++++- unittests/integration/vqe_tester.cpp | 5 ++++- unittests/qir/NVQIRTester.cpp | 9 ++++++++- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp index abdac9b870..0b7a554fb6 100644 --- a/unittests/integration/builder_tester.cpp +++ b/unittests/integration/builder_tester.cpp @@ -63,14 +63,14 @@ CUDAQ_TEST(BuilderTester, checkSimple) { }; cudaq::gradients::central_difference gradient(ansatz, argMapper); cudaq::optimizers::lbfgs optimizer; - optimizer.initial_parameters = {0.35, 0.25}; - optimizer.max_eval = 10; - optimizer.max_line_search_trials = 10; + optimizer.initial_parameters = {0.359, 0.257}; + optimizer.max_eval = 4; + optimizer.max_line_search_trials = 8; auto [opt_val_0, optpp] = cudaq::vqe(ansatz, gradient, h3, optimizer, 2, argMapper); printf("Opt-params: %lf %lf \n", optpp[0], optpp[1]); printf("

= %lf\n", opt_val_0); - EXPECT_NEAR(opt_val_0, -2.045375, 1e-3); + EXPECT_NEAR(opt_val_0, -2.045375, 1e-2); } { @@ -97,12 +97,12 @@ CUDAQ_TEST(BuilderTester, checkSimple) { cudaq::gradients::central_difference gradient(ansatz); cudaq::optimizers::lbfgs optimizer; - optimizer.initial_parameters = {0.35, 0.25}; - optimizer.max_eval = 10; - optimizer.max_line_search_trials = 10; + optimizer.initial_parameters = {0.359, 0.257}; + optimizer.max_eval = 4; + optimizer.max_line_search_trials = 8; auto [opt_val_0, optpp] = cudaq::vqe(ansatz, gradient, h3, optimizer, 2); printf("

= %lf\n", opt_val_0); - EXPECT_NEAR(opt_val_0, -2.045375, 1e-3); + EXPECT_NEAR(opt_val_0, -2.045375, 1e-2); } { @@ -125,7 +125,6 @@ CUDAQ_TEST(BuilderTester, checkSimple) { EXPECT_EQ(counter, 1000); } - // MPS doesn't support gates on more than 2 qubits { auto ccnot_builder = cudaq::make_kernel(); auto q = ccnot_builder.qalloc(3); diff --git a/unittests/integration/gradient_tester.cpp b/unittests/integration/gradient_tester.cpp index 2211925c54..49e03ecf28 100644 --- a/unittests/integration/gradient_tester.cpp +++ b/unittests/integration/gradient_tester.cpp @@ -11,7 +11,10 @@ #include #include -#ifndef CUDAQ_BACKEND_DM +// Skip these gradient tests for slow backends to reduce test time. +// Note: CUDA-Q API level tests (e.g., `cudaq::observe`) should cover all +// backend-specific functionalities required to interface gradient modules. +#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET struct deuteron_n3_ansatz { void operator()(double x0, double x1) __qpu__ { cudaq::qvector q(3); diff --git a/unittests/integration/nlopt_tester.cpp b/unittests/integration/nlopt_tester.cpp index ea745cad18..197d6ffc7e 100644 --- a/unittests/integration/nlopt_tester.cpp +++ b/unittests/integration/nlopt_tester.cpp @@ -13,7 +13,11 @@ #include #include -#ifndef CUDAQ_BACKEND_DM +// Skip these Nlopt optimizer tests for slow backends to reduce test time. +// Note: CUDA-Q API level tests (e.g., `cudaq::observe`) should cover all +// backend-specific functionalities required to interface with optimizers. 
+#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET + struct deuteron_n3_ansatz { void operator()(double x0, double x1) __qpu__ { cudaq::qvector q(3); diff --git a/unittests/integration/vqe_tester.cpp b/unittests/integration/vqe_tester.cpp index 6d18db99e9..5e4b5b9389 100644 --- a/unittests/integration/vqe_tester.cpp +++ b/unittests/integration/vqe_tester.cpp @@ -13,7 +13,10 @@ #include #include -#ifndef CUDAQ_BACKEND_DM +// Skip these VQE tests for slow backends to reduce test time. +// Note: CUDA-Q API level tests (e.g., `cudaq::observe`) should cover all +// backend-specific functionalities required for the `cudaq::vqe` wrapper. +#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET struct ansatz_compute_action { void operator()(std::vector theta) __qpu__ { diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp index f287877573..d8008e6590 100644 --- a/unittests/qir/NVQIRTester.cpp +++ b/unittests/qir/NVQIRTester.cpp @@ -160,8 +160,15 @@ CUDAQ_TEST(NVQIRTester, checkReset) { Qubit *q1 = *reinterpret_cast( __quantum__rt__array_get_element_ptr_1d(qubits, 1)); +#if defined CUDAQ_BACKEND_TENSORNET + // Tensornet backends doesn't have a qubit count limit, just check that it can + // perform qubit reset in a loop. 
+ constexpr int N_ITERS = 3; +#else + constexpr int N_ITERS = 100; +#endif // Make sure that the state vector doesn't grow with each additional reset - for (int i = 0; i < 100; i++) { + for (int i = 0; i < N_ITERS; i++) { __quantum__qis__reset(q0); __quantum__qis__reset(q1); __quantum__qis__x(q1); From 0cbfcc6eff5a92a1347196651a67f78a2d0984a4 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 02:07:35 +0000 Subject: [PATCH 12/18] More test time reduction --- .../integration/bug67_vqe_then_sample.cpp | 2 +- .../deuteron_variational_tester.cpp | 28 +++++++++++-------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/unittests/integration/bug67_vqe_then_sample.cpp b/unittests/integration/bug67_vqe_then_sample.cpp index 42ea18dfdc..2dbac5ba53 100644 --- a/unittests/integration/bug67_vqe_then_sample.cpp +++ b/unittests/integration/bug67_vqe_then_sample.cpp @@ -12,7 +12,7 @@ #include #include -#ifndef CUDAQ_BACKEND_DM +#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET CUDAQ_TEST(VqeThenSample, checkBug67) { diff --git a/unittests/integration/deuteron_variational_tester.cpp b/unittests/integration/deuteron_variational_tester.cpp index 048080d455..74d8585671 100644 --- a/unittests/integration/deuteron_variational_tester.cpp +++ b/unittests/integration/deuteron_variational_tester.cpp @@ -58,18 +58,13 @@ CUDAQ_TEST(D2VariationalTester, checkBroadcast) { cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) + .21829 * z(0) - 6.125 * z(1); +#if defined CUDAQ_BACKEND_TENSORNET + // Reduce test time by reducing the broadcast size. 
+ std::vector params{-M_PI, -M_PI + M_2_PI / 49, + -M_PI + 2 * M_2_PI / 49}; + std::vector expected{12.250290, 12.746370, 13.130148}; +#else auto params = cudaq::linspace(-M_PI, M_PI, 50); - - auto ansatz = [](double theta, int size) __qpu__ { - cudaq::qvector q(size); - x(q[0]); - ry(theta, q[1]); - x(q[1], q[0]); - }; - - auto results = cudaq::observe( - ansatz, h, cudaq::make_argset(params, std::vector(params.size(), 2))); - std::vector expected{ 12.250290, 12.746370, 13.130148, 13.395321, 13.537537, 13.554460, 13.445811, 13.213375, 12.860969, 12.394379, 11.821267, 11.151042, @@ -80,6 +75,17 @@ CUDAQ_TEST(D2VariationalTester, checkBroadcast) { 1.031106, 1.825915, 2.687735, 3.602415, 4.554937, 5.529659, 6.510578, 7.481585, 8.426738, 9.330517, 10.178082, 10.955516, 11.650053, 12.250290}; +#endif + + auto ansatz = [](double theta, int size) __qpu__ { + cudaq::qvector q(size); + x(q[0]); + ry(theta, q[1]); + x(q[1], q[0]); + }; + + auto results = cudaq::observe( + ansatz, h, cudaq::make_argset(params, std::vector(params.size(), 2))); for (std::size_t counter = 0; auto &el : expected) printf("results[%lu] = %.16lf\n", counter++, el); From 48b037ea47d685d87de3eab4f9b7d884dc76a4cb Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 02:32:02 +0000 Subject: [PATCH 13/18] Wrong formula --- unittests/integration/deuteron_variational_tester.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unittests/integration/deuteron_variational_tester.cpp b/unittests/integration/deuteron_variational_tester.cpp index 74d8585671..5f96059930 100644 --- a/unittests/integration/deuteron_variational_tester.cpp +++ b/unittests/integration/deuteron_variational_tester.cpp @@ -60,8 +60,8 @@ CUDAQ_TEST(D2VariationalTester, checkBroadcast) { #if defined CUDAQ_BACKEND_TENSORNET // Reduce test time by reducing the broadcast size. - std::vector params{-M_PI, -M_PI + M_2_PI / 49, - -M_PI + 2 * M_2_PI / 49}; + std::vector params{-M_PI, -M_PI + 2. 
* M_PI / 49., + -M_PI + 4. * M_PI / 49.}; std::vector expected{12.250290, 12.746370, 13.130148}; #else auto params = cudaq::linspace(-M_PI, M_PI, 50); From 445d7a6c17f583bd65668f30d86e292b70741f96 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 04:01:27 +0000 Subject: [PATCH 14/18] Tidy up the code --- .../cutensornet/simulator_mps_register.cpp | 83 ++++++++++++------- runtime/nvqir/cutensornet/tensornet_state.h | 3 +- 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp index 5605c5dbfa..ed3dac968f 100644 --- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp @@ -18,6 +18,7 @@ class SimulatorMPS : public SimulatorTensorNetBase { // Default relative cutoff double m_relCutoff = 1e-5; std::vector m_mpsTensors_d; + // List of auxiliary qubits that were used for controlled-gate decomposition. std::vector m_auxQubitsForGateDecomp; public: @@ -127,17 +128,21 @@ class SimulatorMPS : public SimulatorTensorNetBase { (1ull << (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size())); const std::vector projectedModes(m_auxQubitsForGateDecomp.begin(), m_auxQubitsForGateDecomp.end()); + // Returns the main qubit register state (auxiliary qubits are projected to + // zero state) return cudaq::State{{svDim}, m_state->getStateVector(projectedModes)}; } - size_t addAuxQubit() { + std::vector addAuxQubits(std::size_t n) { if (m_state->isDirty()) throw std::runtime_error( "[MPS Simulator] Unable to perform multi-control gate decomposition " "due to dynamical circuits."); - m_state = std::make_unique(m_state->getNumQubits() + 1, + std::vector aux(n); + std::iota(aux.begin(), aux.end(), m_state->getNumQubits()); + m_state = std::make_unique(m_state->getNumQubits() + n, m_cutnHandle); - return m_state->getNumQubits() - 1; + return aux; } template @@ -150,6 +155,7 @@ class SimulatorMPS : 
public SimulatorTensorNetBase { return; } + // CCNOT decomposition const auto ccnot = [&](std::size_t a, std::size_t b, std::size_t c) { enqueueQuantumOperation>({}, {}, {c}); enqueueQuantumOperation>({}, {b}, {c}); @@ -168,43 +174,55 @@ class SimulatorMPS : public SimulatorTensorNetBase { enqueueQuantumOperation>({}, {a}, {b}); }; + // Collects the given list of control qubits into the given auxiliary + // qubits, using all but the last qubits in the auxiliary list as scratch + // qubits. + // + // For example, if the controls list is 6 qubits, the auxiliary list must be + // 5 qubits, and the state from the 6 control qubits will be collected into + // the last qubit of the auxiliary array. const auto collectControls = [&](const std::vector &ctls, const std::vector &aux, - int adjustment) { - for (int i = 0; i < static_cast(ctls.size()) - 1; i += 2) { - ccnot(ctls[i], ctls[i + 1], aux[i / 2]); - } - for (int i = 0; i < static_cast(ctls.size()) / 2 - 1 - adjustment; - ++i) { - ccnot(aux[i * 2], aux[(i * 2) + 1], aux[i + ctls.size() / 2]); - } + bool reverse = false) { + std::vector> ccnotList; + for (int i = 0; i < static_cast(ctls.size()) - 1; i += 2) + ccnotList.emplace_back( + std::make_tuple(ctls[i], ctls[i + 1], aux[i / 2])); + + for (int i = 0; i < static_cast(ctls.size()) / 2 - 1; ++i) + ccnotList.emplace_back(std::make_tuple(aux[i * 2], aux[(i * 2) + 1], + aux[i + ctls.size() / 2])); + + if (ctls.size() % 2 != 0) + ccnotList.emplace_back(std::make_tuple( + ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2])); + + if (reverse) + std::reverse(ccnotList.begin(), ccnotList.end()); + + for (const auto &[a, b, c] : ccnotList) + ccnot(a, b, c); }; - const auto adjustForSingleControl = - [&](const std::vector &ctls, - const std::vector &aux) { - if (ctls.size() % 2 != 0) - ccnot(ctls[ctls.size() - 1], aux[ctls.size() - 3], - aux[ctls.size() - 2]); - }; - - std::vector aux; - for (std::size_t i = 0; i < controls.size() - 1; ++i) { - const auto 
auxQubit = addAuxQubit(); - m_auxQubitsForGateDecomp.emplace_back(auxQubit); - aux.emplace_back(auxQubit); + + if (m_auxQubitsForGateDecomp.size() < controls.size() - 1) { + const auto aux = + addAuxQubits(controls.size() - 1 - m_auxQubitsForGateDecomp.size()); + m_auxQubitsForGateDecomp.insert(m_auxQubitsForGateDecomp.end(), + aux.begin(), aux.end()); } - collectControls(controls, aux, 0); - adjustForSingleControl(controls, aux); + collectControls(controls, m_auxQubitsForGateDecomp); // Add to the singly-controlled instruction queue enqueueQuantumOperation( - params, {aux[controls.size() - 2]}, targets); + params, {m_auxQubitsForGateDecomp[controls.size() - 2]}, targets); - adjustForSingleControl(controls, aux); - collectControls(controls, aux, 0); - } + collectControls(controls, m_auxQubitsForGateDecomp, true); + }; +// Gate implementations: +// Here, we forward all the calls to the multi-control decomposition helper. +// Decomposed gates are added to the queue. #define CIRCUIT_SIMULATOR_ONE_QUBIT(NAME) \ using CircuitSimulator::NAME; \ void NAME(const std::vector &controls, \ @@ -249,11 +267,14 @@ class SimulatorMPS : public SimulatorTensorNetBase { #undef CIRCUIT_SIMULATOR_ONE_QUBIT #undef CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM + // Swap gate implementation using CircuitSimulator::swap; void swap(const std::vector &ctrlBits, const std::size_t srcIdx, const std::size_t tgtIdx) override { if (ctrlBits.empty()) return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx); + // Controlled swap gate: using cnot decomposition of swap gate to perform + // decomposition. { std::vector ctls = ctrlBits; ctls.emplace_back(tgtIdx); @@ -271,6 +292,8 @@ class SimulatorMPS : public SimulatorTensorNetBase { } } + // `exp-pauli` gate implementation: forward the middle-controlled Rz to the + // decomposition helper. 
void applyExpPauli(double theta, const std::vector &controls, const std::vector &qubitIds, const cudaq::spin_op &op) override { diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h index 7416b3807b..89992eedad 100644 --- a/runtime/nvqir/cutensornet/tensornet_state.h +++ b/runtime/nvqir/cutensornet/tensornet_state.h @@ -19,6 +19,7 @@ class TensorNetState { std::size_t m_numQubits; cutensornetHandle_t m_cutnHandle; cutensornetState_t m_quantumState; + // Track id of tensors that are applied to the state tensors. int64_t m_tensorId = -1; public: @@ -75,7 +76,7 @@ class TensorNetState { /// @brief Number of qubits that this state represents. std::size_t getNumQubits() const { return m_numQubits; } - + /// @brief True if the state contains gate tensors (not just initial qubit /// tensors) bool isDirty() const { return m_tensorId > 0; } From 3e392b95114bd3bb06d174758fdab9e01ef45d13 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 04:31:54 +0000 Subject: [PATCH 15/18] Code reorder --- runtime/nvqir/cutensornet/simulator_mps_register.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp index ed3dac968f..043715ff05 100644 --- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp @@ -111,9 +111,9 @@ class SimulatorMPS : public SimulatorTensorNetBase { m_mpsTensors_d.clear(); } - void resetExecutionContext() override { - SimulatorTensorNetBase::resetExecutionContext(); + void deallocateStateImpl() override { m_auxQubitsForGateDecomp.clear(); + SimulatorTensorNetBase::deallocateStateImpl(); } /// @brief Return the state vector data From 80a8850e7795c1f2780df85fb25dacdf9958380e Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 04:37:47 +0000 Subject: [PATCH 16/18] Typo fix --- unittests/qir/NVQIRTester.cpp | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp index d8008e6590..90da808f38 100644 --- a/unittests/qir/NVQIRTester.cpp +++ b/unittests/qir/NVQIRTester.cpp @@ -161,7 +161,7 @@ CUDAQ_TEST(NVQIRTester, checkReset) { __quantum__rt__array_get_element_ptr_1d(qubits, 1)); #if defined CUDAQ_BACKEND_TENSORNET - // Tensornet backends doesn't have a qubit count limit, just check that it can + // Tensornet backends don't have a qubit count limit, just check that it can // perform qubit reset in a loop. constexpr int N_ITERS = 3; #else From d62007a492ba73558acebef6e37b0eabbfc2593e Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 20 Mar 2024 21:06:20 +0000 Subject: [PATCH 17/18] Code review: optimize control qubit vector usage --- runtime/nvqir/cutensornet/simulator_mps_register.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp index 043715ff05..35a8082990 100644 --- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp @@ -275,19 +275,19 @@ class SimulatorMPS : public SimulatorTensorNetBase { return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx); // Controlled swap gate: using cnot decomposition of swap gate to perform // decomposition. 
+ const auto size = ctrlBits.size(); + std::vector ctls(size + 1); + std::copy(ctrlBits.begin(), ctrlBits.end(), ctls.begin()); { - std::vector ctls = ctrlBits; - ctls.emplace_back(tgtIdx); + ctls[size] = tgtIdx; decomposeMultiControlledInstruction>({}, ctls, {srcIdx}); } { - std::vector ctls = ctrlBits; - ctls.emplace_back(srcIdx); + ctls[size] = srcIdx; decomposeMultiControlledInstruction>({}, ctls, {tgtIdx}); } { - std::vector ctls = ctrlBits; - ctls.emplace_back(tgtIdx); + ctls[size] = tgtIdx; decomposeMultiControlledInstruction>({}, ctls, {srcIdx}); } } From bdce5e63d490551986f0191b6cc5fa45e943b527 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Mon, 1 Apr 2024 19:46:28 +0000 Subject: [PATCH 18/18] Add InvalidTensorIndexValue constant for clarity Co-authored-by: Eric Schweitz --- runtime/nvqir/cutensornet/tensornet_state.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h index 89992eedad..5cf9afbf57 100644 --- a/runtime/nvqir/cutensornet/tensornet_state.h +++ b/runtime/nvqir/cutensornet/tensornet_state.h @@ -13,14 +13,18 @@ #include namespace nvqir { +/// This is used to track whether the tensor state is default initialized vs +/// already has some gates applied to it. +constexpr std::int64_t InvalidTensorIndexValue = -1; + /// @brief Wrapper of cutensornetState_t to provide convenient API's for CUDAQ /// simulator implementation. class TensorNetState { std::size_t m_numQubits; cutensornetHandle_t m_cutnHandle; cutensornetState_t m_quantumState; - // Track id of tensors that are applied to the state tensors. - int64_t m_tensorId = -1; + /// Track id of gate tensors that are applied to the state tensors. + std::int64_t m_tensorId = InvalidTensorIndexValue; public: /// @brief Constructor