Skip to content

Commit

Permalink
Add support for recently introduced cudaq::range(a,b,c) function. (#1476
Browse files Browse the repository at this point in the history
)

This allows the recently introduced range function to be handled in the
C++ bridge correctly.
  • Loading branch information
schweitzpgi committed Apr 8, 2024
1 parent 1c434b1 commit edac569
Show file tree
Hide file tree
Showing 11 changed files with 317 additions and 52 deletions.
13 changes: 13 additions & 0 deletions include/cudaq/Optimizer/Builder/Factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,19 @@ createInvariantLoop(mlir::OpBuilder &builder, mlir::Location loc,
mlir::Region &, mlir::Block &)>
bodyBuilder);

/// Builds a monotonic loop. A monotonic loop executes its body once for each
/// value in the semi-open sequence that begins at \p start, advances by
/// \p step on every iteration, and ends before reaching \p stop (\p stop itself
/// is never visited). \p step may be negative, in which case the sequence
/// counts down from \p start toward \p stop.
///
/// Invalid triples (for example a \p step of zero, or bounds that are
/// transposed relative to the sign of \p step) are intended to make the loop
/// body execute 0 times. Early exits are not allowed.
///
/// This builder threads the loop control value through the loop, so that on
/// exit it holds \p stop (or the first value at or past \p stop that the
/// stepping would have produced).
///
/// \param builder     The builder used to create the loop at its current
///                    insertion point.
/// \param loc         The source location attached to the generated ops.
/// \param start       The first value of the sequence.
/// \param stop        The exclusive bound of the sequence.
/// \param step        The signed increment applied on every iteration.
/// \param bodyBuilder Callback that populates the body region of the loop; it
///                    receives the body region and its entry block.
/// \returns The newly created `cc::LoopOp`.
cc::LoopOp
createMonotonicLoop(mlir::OpBuilder &builder, mlir::Location loc,
mlir::Value start, mlir::Value stop, mlir::Value step,
llvm::function_ref<void(mlir::OpBuilder &, mlir::Location,
mlir::Region &, mlir::Block &)>
bodyBuilder);

bool hasHiddenSRet(mlir::FunctionType funcTy);

/// Convert the function type \p funcTy to a signature compatible with the code
Expand Down
14 changes: 14 additions & 0 deletions include/cudaq/Optimizer/Builder/Intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,23 @@ namespace cudaq {

static constexpr const char llvmMemCopyIntrinsic[] =
"llvm.memcpy.p0i8.p0i8.i64";

// Name of the intrinsic that implements `cudaq::range(count)`.
static constexpr const char setCudaqRangeVector[] = "__nvqpp_CudaqRangeInit";
// Name of the intrinsic that implements `cudaq::range(start, stop, step)`.
static constexpr const char setCudaqRangeVectorTriple[] =
"__nvqpp_CudaqRangeInitTriple";
// Name of the intrinsic that computes the number of iterations described by a
// `cudaq::range(start, stop, step)` triple, where the triple denotes a
// semi-open interval (`stop` is excluded).
static constexpr const char getCudaqSizeFromTriple[] =
"__nvqpp_CudaqSizeFromTriple";

// Convert a sequence of booleans (as bytes) into a std::vector<bool> (which is
// typically specialized to be bit packed).
static constexpr const char stdvecBoolCtorFromInitList[] =
"__nvqpp_initializer_list_to_vector_bool";
// Convert a (likely packed) std::vector<bool> into a sequence of bytes, each
// holding a boolean value.
static constexpr const char stdvecBoolUnpackToInitList[] =
"__nvqpp_vector_bool_to_initializer_list";

Expand Down
37 changes: 29 additions & 8 deletions lib/Frontend/nvqpp/ConvertExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1873,17 +1873,38 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
auto *block = builder.getBlock();
IRBuilder irBuilder(builder.getContext());
auto mod = block->getParentOp()->getParentOfType<ModuleOp>();
auto i64Ty = builder.getI64Type(); // element type
if (funcArity == 1) {
[[maybe_unused]] auto result =
irBuilder.loadIntrinsic(mod, setCudaqRangeVector);
assert(succeeded(result) && "loading intrinsic should never fail");
auto upVal = args[0];
auto upper = builder.create<cc::CastOp>(loc, i64Ty, upVal,
cc::CastOpMode::Unsigned);
auto buffer = builder.create<cc::AllocaOp>(loc, i64Ty, upper);
auto stdvecTy = cc::StdvecType::get(i64Ty);
auto call = builder.create<func::CallOp>(
loc, stdvecTy, setCudaqRangeVector, ValueRange{buffer, upper});
return pushValue(call.getResult(0));
}
assert(funcArity == 3);
[[maybe_unused]] auto result =
irBuilder.loadIntrinsic(mod, setCudaqRangeVector);
irBuilder.loadIntrinsic(mod, setCudaqRangeVectorTriple);
assert(succeeded(result) && "loading intrinsic should never fail");
auto upVal = args[0];
auto i64Ty = builder.getI64Type(); // element type
auto upper = builder.create<cc::CastOp>(loc, i64Ty, upVal,
cc::CastOpMode::Unsigned);
auto buffer = builder.create<cc::AllocaOp>(loc, i64Ty, upper);
Value start = builder.create<cc::CastOp>(loc, i64Ty, args[0],
cc::CastOpMode::Signed);
Value stop = builder.create<cc::CastOp>(loc, i64Ty, args[1],
cc::CastOpMode::Signed);
Value step = builder.create<cc::CastOp>(loc, i64Ty, args[2],
cc::CastOpMode::Signed);
auto lengthCall = builder.create<func::CallOp>(
loc, i64Ty, getCudaqSizeFromTriple, ValueRange{start, stop, step});
Value length = lengthCall.getResult(0);
auto buffer = builder.create<cc::AllocaOp>(loc, i64Ty, length);
auto stdvecTy = cc::StdvecType::get(i64Ty);
auto call = builder.create<func::CallOp>(
loc, stdvecTy, setCudaqRangeVector, ValueRange{buffer, upper});
auto call =
builder.create<func::CallOp>(loc, stdvecTy, setCudaqRangeVectorTriple,
ValueRange{buffer, start, stop, step});
return pushValue(call.getResult(0));
}

Expand Down
41 changes: 34 additions & 7 deletions lib/Frontend/nvqpp/ConvertStmt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x,
auto eleTy = stdvecTy.getElementType();
auto dataPtrTy = cc::PointerType::get(eleTy);
auto dataArrPtrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
auto [iters, ptr] = [&]() -> std::pair<Value, Value> {
if (auto call = buffer.getDefiningOp<func::CallOp>())
auto [iters, ptr, initial,
stepBy] = [&]() -> std::tuple<Value, Value, Value, Value> {
if (auto call = buffer.getDefiningOp<func::CallOp>()) {
if (call.getCallee().equals(setCudaqRangeVector)) {
// The std::vector was produced by cudaq::range(). Optimize this
// special case to use the loop control directly. Erase the transient
Expand All @@ -148,11 +149,30 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x,
// shouldn't get here, but we can erase the call at minimum
call->erase();
}
return {i, {}};
return {i, {}, {}, {}};
} else if (call.getCallee().equals(setCudaqRangeVectorTriple)) {
  // The std::vector was produced by cudaq::range(start, stop, step).
  // Optimize this special case to drive the loop from the triple directly
  // and erase the transient buffer and its initialization.
  //
  // Every operand that is needed after this point must be read BEFORE the
  // call is erased: erasing destroys the operation, so touching `call`
  // afterwards would be a use-after-free.
  Value rangeStart = call.getOperand(1);
  Value i = call.getOperand(2); // the (exclusive) stop value
  Value rangeStep = call.getOperand(3);
  if (auto alloc = call.getOperand(0).getDefiningOp<cc::AllocaOp>()) {
    // The buffer length is normally produced by a call to the size
    // intrinsic. Remember that call so it can be removed as well.
    Operation *callGetSizeOp = nullptr;
    if (auto seqSize = alloc.getSeqSize()) {
      if (auto callSize = seqSize.getDefiningOp<func::CallOp>())
        if (callSize.getCallee().equals(getCudaqSizeFromTriple))
          callGetSizeOp = callSize.getOperation();
    }
    // Erase users before their definitions: call, then alloca, then size.
    call->erase(); // erase call must be first
    alloc->erase();
    if (callGetSizeOp)
      callGetSizeOp->erase();
  } else {
    // shouldn't get here, but we can erase the call at minimum
    call->erase();
  }
  return {i, {}, rangeStart, rangeStep};
}
}
Value i = builder.create<cc::StdvecSizeOp>(loc, i64Ty, buffer);
Value p = builder.create<cc::StdvecDataOp>(loc, dataArrPtrTy, buffer);
return {i, p};
return {i, p, {}, {}};
}();

auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region &region,
Expand Down Expand Up @@ -196,9 +216,16 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x,
builder.create<cc::ScopeOp>(loc, scopeBuilder);
};

auto idxIters = builder.create<cudaq::cc::CastOp>(
loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned);
opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder);
if (!initial) {
auto idxIters = builder.create<cudaq::cc::CastOp>(
loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned);
opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder);
} else {
auto idxIters = builder.create<cudaq::cc::CastOp>(
loc, i64Ty, iters, cudaq::cc::CastOpMode::Signed);
opt::factory::createMonotonicLoop(builder, loc, initial, idxIters, stepBy,
bodyBuilder);
}
} else if (auto veqTy = dyn_cast<quake::VeqType>(buffer.getType());
veqTy && veqTy.hasSpecifiedSize()) {
Value iters =
Expand Down
68 changes: 61 additions & 7 deletions lib/Optimizer/Builder/Factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

#include "cudaq/Optimizer/Builder/Factory.h"
#include "cudaq/Optimizer/Builder/Intrinsics.h"
#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
Expand Down Expand Up @@ -175,24 +175,21 @@ cc::LoopOp factory::createInvariantLoop(
auto loop = builder.create<cc::LoopOp>(
loc, resultTys, inputs, /*postCondition=*/false,
[&](OpBuilder &builder, Location loc, Region &region) {
cc::RegionBuilderGuard guard(builder, loc, region,
TypeRange{zero.getType()});
cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty});
auto &block = *builder.getBlock();
Value cmpi = builder.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::slt, block.getArgument(0),
totalIterations);
builder.create<cc::ConditionOp>(loc, cmpi, block.getArguments());
},
[&](OpBuilder &builder, Location loc, Region &region) {
cc::RegionBuilderGuard guard(builder, loc, region,
TypeRange{zero.getType()});
cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty});
auto &block = *builder.getBlock();
bodyBuilder(builder, loc, region, block);
builder.create<cc::ContinueOp>(loc, block.getArguments());
},
[&](OpBuilder &builder, Location loc, Region &region) {
cc::RegionBuilderGuard guard(builder, loc, region,
TypeRange{zero.getType()});
cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty});
auto &block = *builder.getBlock();
auto incr =
builder.create<arith::AddIOp>(loc, block.getArgument(0), one);
Expand All @@ -202,6 +199,63 @@ cc::LoopOp factory::createInvariantLoop(
return loop;
}

// Emits a monotonic loop directly in its invariant (counted) form rather than
// leaving that rewrite to loop analysis and normalization, which would perform
// a similar transformation later. The generated loop carries two values: an
// iteration counter that runs from 0 up to the trip count, and the user
// visible value that begins at `start` and advances by `step`.
cc::LoopOp factory::createMonotonicLoop(
    OpBuilder &builder, Location loc, Value start, Value stop, Value step,
    llvm::function_ref<void(OpBuilder &, Location, Region &, Block &)>
        bodyBuilder) {
  // The trip count is computed by an intrinsic; make sure it is present in
  // the module that encloses the insertion point.
  auto parentModule =
      builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
  IRBuilder intrinsicBuilder(builder.getContext());
  [[maybe_unused]] auto loadStatus =
      intrinsicBuilder.loadIntrinsic(parentModule, getCudaqSizeFromTriple);
  assert(succeeded(loadStatus) && "loading intrinsic should never fail");

  auto i64Ty = builder.getI64Type();
  // All three components of the triple are treated as signed 64-bit values.
  auto asSignedI64 = [&](Value v) -> Value {
    return builder.create<cc::CastOp>(loc, i64Ty, v, cc::CastOpMode::Signed);
  };
  Value first = asSignedI64(start);
  Value stride = asSignedI64(step);
  Value last = asSignedI64(stop);
  Value counterInit = builder.create<arith::ConstantIntOp>(loc, 0, 64);

  SmallVector<Value> loopInputs = {counterInit, first};
  SmallVector<Type> loopResultTys = {i64Ty, i64Ty};
  auto sizeCall = builder.create<func::CallOp>(
      loc, i64Ty, getCudaqSizeFromTriple, ValueRange{first, last, stride});
  Value tripCount = sizeCall.getResult(0);

  // while: keep iterating as long as the counter is below the trip count.
  auto whileBuilder = [&](OpBuilder &b, Location l, Region &r) {
    cc::RegionBuilderGuard guard(b, l, r, TypeRange{i64Ty, i64Ty});
    Block &entry = *b.getBlock();
    Value keepGoing = b.create<arith::CmpIOp>(l, arith::CmpIPredicate::slt,
                                              entry.getArgument(0), tripCount);
    b.create<cc::ConditionOp>(l, keepGoing, entry.getArguments());
  };

  // do: let the client populate the body, then forward both carried values.
  auto doBuilder = [&](OpBuilder &b, Location l, Region &r) {
    cc::RegionBuilderGuard guard(b, l, r, TypeRange{i64Ty, i64Ty});
    Block &entry = *b.getBlock();
    bodyBuilder(b, l, r, entry);
    b.create<cc::ContinueOp>(l, entry.getArguments());
  };

  // step: bump the counter by one and the visible value by the stride.
  auto stepBuilder = [&](OpBuilder &b, Location l, Region &r) {
    cc::RegionBuilderGuard guard(b, l, r, TypeRange{i64Ty, i64Ty});
    Block &entry = *b.getBlock();
    Value unit = b.create<arith::ConstantIntOp>(l, 1, 64);
    Value nextCount = b.create<arith::AddIOp>(l, entry.getArgument(0), unit);
    Value nextValue = b.create<arith::AddIOp>(l, entry.getArgument(1), stride);
    b.create<cc::ContinueOp>(l, ValueRange{nextCount, nextValue});
  };

  auto loopOp = builder.create<cc::LoopOp>(loc, loopResultTys, loopInputs,
                                           /*postCondition=*/false,
                                           whileBuilder, doBuilder, stepBuilder);
  loopOp->setAttr("invariant", builder.getUnitAttr());
  return loopOp;
}

// FIXME: some ABIs may return a small struct in registers rather than via an
// sret pointer.
//
Expand Down
68 changes: 67 additions & 1 deletion lib/Optimizer/Builder/Intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,77 @@ static constexpr IntrinsicCode intrinsicTable[] = {
%one = arith.constant 1 : i64
%s1 = arith.addi %i, %one : i64
cc.continue %s1 : i64
}
} {invariant}
%2 = cc.stdvec_init %arg0, %arg1 : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.stdvec<i64>
return %2 : !cc.stdvec<i64>
})#"},

// Compute and initialize a vector from a semi-open triple style notation.
// The vector returned will contain the ordered set defined by the triple.
// That set is specifically `{ i, i+s, i+2*s, ... i+(n-1)*s }` where `i` is
// the initial value `%arg1`, `s` is the step `%arg3`, and the last value
// `i+(n-1)*s` lies in the interval `[%arg1 .. %arg2)` when `%arg3` is
// positive or in `(%arg2 .. %arg1]` when `%arg3` is negative. The bound
// `%arg2` itself is never a member of the set.
//
// The element count `n` is obtained by calling `__nvqpp_CudaqSizeFromTriple`,
// and the elements are stored into the caller-provided buffer `%arg0`.
// Invalid triples, such as the step being zero or the lower and upper bounds
// being transposed, are meant to return a vector of length 0 (an empty set).
// Note that all three parameters are assumed to be signed values, which is
// required to have a decrementing loop.
{cudaq::setCudaqRangeVectorTriple,
{cudaq::getCudaqSizeFromTriple},
R"#(
func.func private @__nvqpp_CudaqRangeInitTriple(%arg0: !cc.ptr<!cc.array<i64 x ?>>, %arg1: i64, %arg2: i64, %arg3: i64) -> !cc.stdvec<i64> {
%c1_i64 = arith.constant 1 : i64
%c0_i64 = arith.constant 0 : i64
%0 = call @__nvqpp_CudaqSizeFromTriple(%arg1, %arg2, %arg3) : (i64, i64, i64) -> i64
%1:2 = cc.loop while ((%arg4 = %c0_i64, %arg5 = %arg1) -> (i64, i64)) {
%3 = arith.cmpi ult, %arg4, %0 : i64
cc.condition %3(%arg4, %arg5 : i64, i64)
} do {
^bb0(%arg4: i64, %arg5: i64):
%3 = cc.compute_ptr %arg0[%arg4] : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.ptr<i64>
cc.store %arg5, %3 : !cc.ptr<i64>
cc.continue %arg4, %arg5 : i64, i64
} step {
^bb0(%arg4: i64, %arg5: i64):
%3 = arith.addi %arg4, %c1_i64 : i64
%4 = arith.addi %arg5, %arg3 : i64
cc.continue %3, %4 : i64, i64
} {invariant}
%2 = cc.stdvec_init %arg0, %0 : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.stdvec<i64>
return %2 : !cc.stdvec<i64>
})#"},

// Compute the total number of iterations, which is the value `n`, from a
// semi-open triple style notation. The set defined by the triple is `{ i,
// i+s, i+2*s, ... i+(n-1)*s }` where `i` is the initial value `%start`, `s`
// is the step `%step`, and the last value `i+(n-1)*s` lies in the interval
// `[start .. stop)` when the step is positive or in `(stop .. start]` when
// the step is negative. Invalid triples, such as the step being zero or the
// lower and upper bounds being transposed, return a value of 0.
// Note that all three parameters are assumed to be signed values, which is
// required to have a decrementing loop.
//
// The computation is `n = (stop - adjust - start + step) / step`, where
// `adjust` is +1 for a positive step and -1 for a negative step, using a
// truncating signed division. Any result that is not strictly positive is
// clamped to 0.
//
// The first branch guards the division: only a NON-zero step may proceed to
// `^b1`. A zero step must exit immediately with 0, since `arith.divsi` by
// zero is undefined.
{cudaq::getCudaqSizeFromTriple,
{},
R"#(
func.func private @__nvqpp_CudaqSizeFromTriple(%start: i64, %stop: i64, %step: i64) -> i64 {
%0 = arith.constant 0 : i64
%1 = arith.constant 1 : i64
%n1 = arith.constant -1 : i64
%c1 = arith.cmpi ne, %step, %0 : i64
cf.cond_br %c1, ^b1, ^exit(%0 : i64)
^b1:
%c2 = arith.cmpi sgt, %step, %0 : i64
%adjust = arith.select %c2, %1, %n1 : i64
%2 = arith.subi %stop, %adjust : i64
%3 = arith.subi %2, %start : i64
%4 = arith.addi %3, %step : i64
%5 = arith.divsi %4, %step : i64
%c3 = arith.cmpi sgt, %5, %0 : i64
cf.cond_br %c3, ^exit(%5 : i64), ^exit(%0 : i64)
^exit(%rv : i64):
return %rv : i64
})#"},

{"__nvqpp_createDynamicResult",
{cudaq::llvmMemCopyIntrinsic, "malloc"},
R"#(
Expand Down
6 changes: 3 additions & 3 deletions runtime/cudaq/builder/kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ std::vector<double> getAlphaZ(const std::span<double> data,
std::vector<double> getAlphaY(const std::span<double> data,
std::size_t numQubits, std::size_t k) {
std::vector<std::vector<std::size_t>> inNum, inDenom;
auto twoNmK = (1ULL << (numQubits - k)), twoK = (1ULL << k),
twoKmOne = (1ULL << (k - 1));
auto twoNmK = (1LL << (numQubits - k)), twoK = (1LL << k),
twoKmOne = (1LL << (k - 1));
for (auto j : cudaq::range(twoNmK)) {
std::vector<std::size_t> local;
for (auto l : cudaq::range(twoKmOne))
Expand Down Expand Up @@ -184,4 +184,4 @@ std::vector<double> getAlphaY(const std::span<double> data,

return res;
}
} // namespace cudaq::details
} // namespace cudaq::details
5 changes: 3 additions & 2 deletions runtime/cudaq/builder/kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ template <typename Kernel>
void from_state(Kernel &&kernel, QuakeValue &qubits,
const std::span<std::complex<double>> data,
std::size_t inNumQubits = 0) {
auto numQubits = qubits.constantSize().value_or(inNumQubits);
if (numQubits == 0)
std::make_signed_t<std::size_t> numQubits =
qubits.constantSize().value_or(inNumQubits);
if (numQubits <= 0)
throw std::runtime_error(
"[from_state] cannot infer size of input quantum register, please "
"specify the number of qubits via the from_state() final argument.");
Expand Down
Loading

0 comments on commit edac569

Please sign in to comment.