Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MPS backend to support gates on 3+ qubits #1420

Merged
merged 22 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 241 additions & 0 deletions runtime/nvqir/cutensornet/simulator_mps_register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class SimulatorMPS : public SimulatorTensorNetBase {
// Default relative cutoff
double m_relCutoff = 1e-5;
std::vector<void *> m_mpsTensors_d;
// List of auxiliary qubits that were used for controlled-gate decomposition.
std::vector<std::size_t> m_auxQubitsForGateDecomp;

public:
SimulatorMPS() : SimulatorTensorNetBase() {
Expand Down Expand Up @@ -108,6 +110,245 @@ class SimulatorMPS : public SimulatorTensorNetBase {
}
m_mpsTensors_d.clear();
}

void deallocateStateImpl() override {
m_auxQubitsForGateDecomp.clear();
SimulatorTensorNetBase::deallocateStateImpl();
}

/// @brief Return the state vector data
cudaq::State getStateData() override {
LOG_API_TIME();
if (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size() > 64)
throw std::runtime_error("State vector data is too large.");
// Handle empty state (e.g., no qubit allocation)
if (!m_state)
return cudaq::State{{0}, {}};
const uint64_t svDim =
(1ull << (m_state->getNumQubits() - m_auxQubitsForGateDecomp.size()));
const std::vector<int32_t> projectedModes(m_auxQubitsForGateDecomp.begin(),
m_auxQubitsForGateDecomp.end());
// Returns the main qubit register state (auxiliary qubits are projected to
// zero state)
return cudaq::State{{svDim}, m_state->getStateVector(projectedModes)};
}

std::vector<size_t> addAuxQubits(std::size_t n) {
if (m_state->isDirty())
throw std::runtime_error(
"[MPS Simulator] Unable to perform multi-control gate decomposition "
"due to dynamical circuits.");
std::vector<size_t> aux(n);
std::iota(aux.begin(), aux.end(), m_state->getNumQubits());
m_state = std::make_unique<TensorNetState>(m_state->getNumQubits() + n,
m_cutnHandle);
return aux;
}

template <typename QuantumOperation>
void
decomposeMultiControlledInstruction(const std::vector<double> &params,
const std::vector<std::size_t> &controls,
const std::vector<std::size_t> &targets) {
if (controls.size() <= 1) {
enqueueQuantumOperation<QuantumOperation>(params, controls, targets);
return;
}

// CCNOT decomposition
const auto ccnot = [&](std::size_t a, std::size_t b, std::size_t c) {
enqueueQuantumOperation<nvqir::h<double>>({}, {}, {c});
enqueueQuantumOperation<nvqir::x<double>>({}, {b}, {c});
enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {c});
enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {c});
enqueueQuantumOperation<nvqir::t<double>>({}, {}, {c});
enqueueQuantumOperation<nvqir::x<double>>({}, {b}, {c});
enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {c});
enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {c});
enqueueQuantumOperation<nvqir::t<double>>({}, {}, {b});
enqueueQuantumOperation<nvqir::t<double>>({}, {}, {c});
enqueueQuantumOperation<nvqir::h<double>>({}, {}, {c});
enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {b});
enqueueQuantumOperation<nvqir::t<double>>({}, {}, {a});
enqueueQuantumOperation<nvqir::tdg<double>>({}, {}, {b});
enqueueQuantumOperation<nvqir::x<double>>({}, {a}, {b});
};

// Collects the given list of control qubits into the given auxiliary
// qubits, using all but the last qubits in the auxiliary list as scratch
// qubits.
//
// For example, if the controls list is 6 qubits, the auxiliary list must be
// 5 qubits, and the state from the 6 control qubits will be collected into
// the last qubit of the auxiliary array.
const auto collectControls = [&](const std::vector<std::size_t> &ctls,
const std::vector<std::size_t> &aux,
bool reverse = false) {
std::vector<std::tuple<std::size_t, std::size_t, std::size_t>> ccnotList;
for (int i = 0; i < static_cast<int>(ctls.size()) - 1; i += 2)
ccnotList.emplace_back(
std::make_tuple(ctls[i], ctls[i + 1], aux[i / 2]));

for (int i = 0; i < static_cast<int>(ctls.size()) / 2 - 1; ++i)
ccnotList.emplace_back(std::make_tuple(aux[i * 2], aux[(i * 2) + 1],
aux[i + ctls.size() / 2]));

if (ctls.size() % 2 != 0)
ccnotList.emplace_back(std::make_tuple(
ctls[ctls.size() - 1], aux[ctls.size() - 3], aux[ctls.size() - 2]));

if (reverse)
std::reverse(ccnotList.begin(), ccnotList.end());

for (const auto &[a, b, c] : ccnotList)
ccnot(a, b, c);
};

if (m_auxQubitsForGateDecomp.size() < controls.size() - 1) {
const auto aux =
addAuxQubits(controls.size() - 1 - m_auxQubitsForGateDecomp.size());
m_auxQubitsForGateDecomp.insert(m_auxQubitsForGateDecomp.end(),
aux.begin(), aux.end());
}

collectControls(controls, m_auxQubitsForGateDecomp);

// Add to the singly-controlled instruction queue
enqueueQuantumOperation<QuantumOperation>(
params, {m_auxQubitsForGateDecomp[controls.size() - 2]}, targets);

collectControls(controls, m_auxQubitsForGateDecomp, true);
};

// Gate implementations:
// Here, we forward all the call to the multi-control decomposition helper.
// Decomposed gates are added to the queue.
#define CIRCUIT_SIMULATOR_ONE_QUBIT(NAME) \
using CircuitSimulator::NAME; \
void NAME(const std::vector<std::size_t> &controls, \
const std::size_t qubitIdx) override { \
decomposeMultiControlledInstruction<nvqir::NAME<double>>( \
{}, controls, std::vector<std::size_t>{qubitIdx}); \
}

#define CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(NAME) \
using CircuitSimulator::NAME; \
void NAME(const double angle, const std::vector<std::size_t> &controls, \
const std::size_t qubitIdx) override { \
decomposeMultiControlledInstruction<nvqir::NAME<double>>( \
{angle}, controls, std::vector<std::size_t>{qubitIdx}); \
}

/// @brief The X gate
CIRCUIT_SIMULATOR_ONE_QUBIT(x)
/// @brief The Y gate
CIRCUIT_SIMULATOR_ONE_QUBIT(y)
/// @brief The Z gate
CIRCUIT_SIMULATOR_ONE_QUBIT(z)
/// @brief The H gate
CIRCUIT_SIMULATOR_ONE_QUBIT(h)
/// @brief The S gate
CIRCUIT_SIMULATOR_ONE_QUBIT(s)
/// @brief The T gate
CIRCUIT_SIMULATOR_ONE_QUBIT(t)
/// @brief The Sdg gate
CIRCUIT_SIMULATOR_ONE_QUBIT(sdg)
/// @brief The Tdg gate
CIRCUIT_SIMULATOR_ONE_QUBIT(tdg)
/// @brief The RX gate
CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rx)
/// @brief The RY gate
CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(ry)
/// @brief The RZ gate
CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(rz)
/// @brief The Phase gate
CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM(r1)
// Undef those preprocessor defines.
#undef CIRCUIT_SIMULATOR_ONE_QUBIT
#undef CIRCUIT_SIMULATOR_ONE_QUBIT_ONE_PARAM

// Swap gate implementation
using CircuitSimulator::swap;
void swap(const std::vector<std::size_t> &ctrlBits, const std::size_t srcIdx,
const std::size_t tgtIdx) override {
if (ctrlBits.empty())
return SimulatorTensorNetBase::swap(ctrlBits, srcIdx, tgtIdx);
// Controlled swap gate: using cnot decomposition of swap gate to perform
// decomposition.
const auto size = ctrlBits.size();
std::vector<std::size_t> ctls(size + 1);
std::copy(ctrlBits.begin(), ctrlBits.end(), ctls.begin());
{
ctls[size] = tgtIdx;
decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
}
{
ctls[size] = srcIdx;
decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {tgtIdx});
}
{
ctls[size] = tgtIdx;
decomposeMultiControlledInstruction<nvqir::x<double>>({}, ctls, {srcIdx});
}
}

// `exp-pauli` gate implementation: forward the middle-controlled Rz to the
// decomposition helper.
void applyExpPauli(double theta, const std::vector<std::size_t> &controls,
const std::vector<std::size_t> &qubitIds,
const cudaq::spin_op &op) override {
if (op.is_identity()) {
if (controls.empty()) {
// exp(i*theta*Id) is noop if this is not a controlled gate.
return;
} else {
// Throw an error if this exp_pauli(i*theta*Id) becomes a non-trivial
// gate due to control qubits.
// FIXME: revisit this once
// https://github.com/NVIDIA/cuda-quantum/issues/483 is implemented.
throw std::logic_error("Applying controlled global phase via exp_pauli "
"of identity operator is not supported");
}
}
std::vector<std::size_t> qubitSupport;
std::vector<std::function<void(bool)>> basisChange;
op.for_each_pauli([&](cudaq::pauli type, std::size_t qubitIdx) {
if (type != cudaq::pauli::I)
qubitSupport.push_back(qubitIds[qubitIdx]);

if (type == cudaq::pauli::Y)
basisChange.emplace_back([&, qubitIdx](bool reverse) {
rx(!reverse ? M_PI_2 : -M_PI_2, qubitIds[qubitIdx]);
});
else if (type == cudaq::pauli::X)
basisChange.emplace_back(
[&, qubitIdx](bool) { h(qubitIds[qubitIdx]); });
});

if (!basisChange.empty())
for (auto &basis : basisChange)
basis(false);

std::vector<std::pair<std::size_t, std::size_t>> toReverse;
for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) {
x({qubitSupport[i]}, qubitSupport[i + 1]);
toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]);
}

// Perform multi-control decomposition.
decomposeMultiControlledInstruction<nvqir::rz<double>>(
{-2.0 * theta}, controls, {qubitSupport.back()});

std::reverse(toReverse.begin(), toReverse.end());
for (auto &[i, j] : toReverse)
x({i}, j);

if (!basisChange.empty()) {
std::reverse(basisChange.begin(), basisChange.end());
for (auto &basis : basisChange)
basis(true);
}
}
};
} // end namespace nvqir

Expand Down
28 changes: 15 additions & 13 deletions runtime/nvqir/cutensornet/tensornet_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,17 @@ TensorNetState::TensorNetState(std::size_t numQubits,
void TensorNetState::applyGate(const std::vector<int32_t> &qubitIds,
void *gateDeviceMem, bool adjoint) {

int64_t id = 0;
HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(
m_cutnHandle, m_quantumState, qubitIds.size(), qubitIds.data(),
gateDeviceMem, nullptr, /*immutable*/ 1,
/*adjoint*/ static_cast<int32_t>(adjoint), /*unitary*/ 1, &id));
/*adjoint*/ static_cast<int32_t>(adjoint), /*unitary*/ 1, &m_tensorId));
}

void TensorNetState::applyQubitProjector(void *proj_d, int32_t qubitIdx) {
int64_t id = 0;
HANDLE_CUTN_ERROR(
cutensornetStateApplyTensor(m_cutnHandle, m_quantumState, 1, &qubitIdx,
proj_d, nullptr, /*immutable*/ 1,
/*adjoint*/ 0, /*unitary*/ 0, &id));
/*adjoint*/ 0, /*unitary*/ 0, &m_tensorId));
}

std::unordered_map<std::string, size_t>
Expand Down Expand Up @@ -120,24 +118,26 @@ TensorNetState::sample(const std::vector<int32_t> &measuredBitIds,
return counts;
}

std::vector<std::complex<double>> TensorNetState::getStateVector() {
std::vector<std::complex<double>>
TensorNetState::getStateVector(const std::vector<int32_t> &projectedModes) {
// Make sure that we don't overflow the memory size calculation.
// Note: the actual limitation will depend on the system memory.
if (m_numQubits > 64 ||
(1ull << m_numQubits) >
if ((m_numQubits - projectedModes.size()) > 64 ||
(1ull << (m_numQubits - projectedModes.size())) >
std::numeric_limits<uint64_t>::max() / sizeof(std::complex<double>))
throw std::runtime_error(
"Too many qubits are requested for full state vector contraction.");
LOG_API_TIME();
void *d_sv{nullptr};
const uint64_t svDim = 1ull << m_numQubits;
const uint64_t svDim = 1ull << (m_numQubits - projectedModes.size());
HANDLE_CUDA_ERROR(cudaMalloc(&d_sv, svDim * sizeof(std::complex<double>)));
ScratchDeviceMem scratchPad;

// Create the quantum state amplitudes accessor
cutensornetStateAccessor_t accessor;
HANDLE_CUTN_ERROR(cutensornetCreateAccessor(m_cutnHandle, m_quantumState, 0,
nullptr, nullptr, &accessor));
HANDLE_CUTN_ERROR(cutensornetCreateAccessor(
m_cutnHandle, m_quantumState, projectedModes.size(),
projectedModes.data(), nullptr, &accessor));

const int32_t numHyperSamples =
8; // desired number of hyper samples used in the tensor network
Expand Down Expand Up @@ -167,9 +167,11 @@ std::vector<std::complex<double>> TensorNetState::getStateVector() {

// Compute the quantum state amplitudes
std::complex<double> stateNorm{0.0, 0.0};
HANDLE_CUTN_ERROR(
cutensornetAccessorCompute(m_cutnHandle, accessor, nullptr, workDesc,
d_sv, static_cast<void *>(&stateNorm), 0));
// All projected modes are assumed to be projected to 0.
std::vector<int64_t> projectedModeValues(projectedModes.size(), 0);
HANDLE_CUTN_ERROR(cutensornetAccessorCompute(
m_cutnHandle, accessor, projectedModeValues.data(), workDesc, d_sv,
static_cast<void *>(&stateNorm), 0));
std::vector<std::complex<double>> h_sv(svDim);
HANDLE_CUDA_ERROR(cudaMemcpy(h_sv.data(), d_sv,
svDim * sizeof(std::complex<double>),
Expand Down
12 changes: 11 additions & 1 deletion runtime/nvqir/cutensornet/tensornet_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,18 @@
#include <unordered_map>

namespace nvqir {
/// This is used to track whether the tensor state is default initialized vs
/// already has some gates applied to.
constexpr std::int64_t InvalidTensorIndexValue = -1;

/// @brief Wrapper of cutensornetState_t to provide convenient API's for CUDAQ
/// simulator implementation.
class TensorNetState {
std::size_t m_numQubits;
cutensornetHandle_t m_cutnHandle;
cutensornetState_t m_quantumState;
/// Track id of gate tensors that are applied to the state tensors.
std::int64_t m_tensorId = InvalidTensorIndexValue;

public:
/// @brief Constructor
Expand All @@ -45,7 +51,8 @@ class TensorNetState {

/// @brief Contract the tensor network representation to retrieve the state
/// vector.
std::vector<std::complex<double>> getStateVector();
std::vector<std::complex<double>>
getStateVector(const std::vector<int32_t> &projectedModes = {});

/// @brief Compute the reduce density matrix on a set of qubits
///
Expand Down Expand Up @@ -74,6 +81,9 @@ class TensorNetState {
/// @brief Number of qubits that this state represents.
std::size_t getNumQubits() const { return m_numQubits; }

/// @brief True if the state contains gate tensors (not just initial qubit
/// tensors)
bool isDirty() const { return m_tensorId > 0; }
/// @brief Destructor
~TensorNetState();
};
Expand Down
2 changes: 1 addition & 1 deletion unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER)
set(TEST_LABELS "gpu_required")
endif()
if (${NVQIR_BACKEND} STREQUAL "tensornet-mps")
target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_BACKEND_TENSORNET_MPS)
target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET)
set(TEST_LABELS "gpu_required")
endif()
if (${NVQIR_BACKEND} STREQUAL "custatevec-fp32")
Expand Down
2 changes: 1 addition & 1 deletion unittests/integration/bug67_vqe_then_sample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cudaq/optimizers.h>
#include <cudaq/platform.h>

#ifndef CUDAQ_BACKEND_DM
#if !defined CUDAQ_BACKEND_DM && !defined CUDAQ_BACKEND_TENSORNET

CUDAQ_TEST(VqeThenSample, checkBug67) {

Expand Down
Loading
Loading