Adds the ability to pass vector<bool> values to kernels as input #1404

Merged 4 commits on Mar 18, 2024
13 changes: 11 additions & 2 deletions include/cudaq/Optimizer/Builder/Factory.h
@@ -23,7 +23,15 @@ class PointerType;
class StructType;
} // namespace cc

namespace opt::factory {
namespace opt {

template <typename T>
requires std::integral<T>
T convertBitsToBytes(T bits) {
return (bits + 7) / 8;
}

namespace factory {

constexpr const char targetTripleAttrName[] = "llvm.triple";
constexpr const char targetDataLayoutAttrName[] = "llvm.data_layout";
Expand Down Expand Up @@ -225,5 +233,6 @@ bool isAArch64(mlir::ModuleOp);
/// the X86-64 ABI.) If \p ty is not a `struct`, this returns `false`.
bool structUsesTwoArguments(mlir::Type ty);

} // namespace opt::factory
} // namespace factory
} // namespace opt
} // namespace cudaq
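
Note: the new convertBitsToBytes helper rounds a bit width up to whole bytes, so a 1-bit i1 maps to one byte instead of zero. A minimal standalone sketch of the intended behavior (illustrative only, written outside the cudaq::opt namespace):

#include <cassert>
#include <concepts>

template <typename T>
  requires std::integral<T>
T convertBitsToBytes(T bits) {
  return (bits + 7) / 8; // round up to the nearest whole byte
}

int main() {
  assert(convertBitsToBytes(1u) == 1);  // i1: 1 bit still occupies a full byte
  assert(convertBitsToBytes(32u) == 4); // f32 -> 4 bytes
  assert(convertBitsToBytes(64u) == 8); // f64 -> 8 bytes
  return 0;
}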
2 changes: 2 additions & 0 deletions include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -17,6 +17,8 @@ static constexpr const char llvmMemCopyIntrinsic[] =
static constexpr const char setCudaqRangeVector[] = "__nvqpp_CudaqRangeInit";
static constexpr const char stdvecBoolCtorFromInitList[] =
"__nvqpp_initializer_list_to_vector_bool";
static constexpr const char stdvecBoolUnpackToInitList[] =
"__nvqpp_vector_bool_to_initializer_list";

/// Builder for lowering the clang AST to an IR for CUDA Quantum. Lowering
/// includes the transformation of both quantum and classical computation.
6 changes: 6 additions & 0 deletions lib/Optimizer/Builder/Intrinsics.cpp
@@ -107,6 +107,12 @@ static constexpr IntrinsicCode intrinsicTable[] = {
return %0 : !cc.ptr<i8>
})#"},

// __nvqpp_vector_bool_to_initializer_list
{cudaq::stdvecBoolUnpackToInitList,
{},
R"#(
func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr<!cc.struct<{!cc.ptr<i1>, !cc.ptr<i1>, !cc.ptr<i1>}>>, !cc.ptr<!cc.struct<{!cc.ptr<i1>, !cc.ptr<i1>, !cc.ptr<i1>}>>) -> ())#"},

{"__nvqpp_zeroDynamicResult", {}, R"#(
func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr<i8>, i64}> {
%c0_i64 = arith.constant 0 : i64
59 changes: 57 additions & 2 deletions lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -298,6 +298,7 @@ class GenerateKernelExecution
// Process all the arguments for the original call by looping over the
// kernel's arguments.
bool hasTrailingData = false;
DenseMap<std::int32_t, Value> replacementArgs;
for (auto kaIter : llvm::enumerate(kernelArgTypes)) {
std::int32_t idx = kaIter.index();

@@ -324,7 +325,19 @@
// buffer's addendum (unless the vector is length 0).
auto ptrInTy = cudaq::cc::PointerType::get(
cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()));

Value arg = builder.create<cudaq::cc::CastOp>(loc, ptrInTy, argPtr);
if (stdvecTy.getElementType() == builder.getI1Type()) {
// Create a mock vector of i8 and populate the bools, 1 per char.
Value temp = builder.create<cudaq::cc::AllocaOp>(
loc, ptrInTy.getElementType());
builder.create<func::CallOp>(loc, std::nullopt,
cudaq::stdvecBoolUnpackToInitList,
ArrayRef<Value>{temp, arg});
replacementArgs[idx] = temp;
arg = temp;
}

auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes(
loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes);
stVal = p1;
@@ -410,6 +423,20 @@
arg = builder.create<cudaq::cc::CastOp>(loc, ptrInTy, arg);
vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg,
vecToBuffer, ptrInTy);
if (stdvecTy.getElementType() == builder.getI1Type()) {
auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type());
assert(replacementArgs.count(idx) && "must be in map");
auto arg = replacementArgs[idx];
auto heapPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(ptrI1Ty), arg,
ArrayRef<cudaq::cc::ComputePtrArg>{0, 0});
auto loadHeapPtr = builder.create<cudaq::cc::LoadOp>(loc, heapPtr);
auto i8Ty = builder.getI8Type();
Value heapCast = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr);
builder.create<func::CallOp>(loc, std::nullopt, "free",
ArrayRef<Value>{heapCast});
}
} else if (auto strTy = dyn_cast<cudaq::cc::StructType>(currArgTy)) {
if (cudaq::cc::isDynamicType(strTy)) {
Value argPtrPtr = builder.create<cudaq::cc::ComputePtrOp>(
@@ -1023,6 +1050,8 @@ class GenerateKernelExecution
SmallVector<BlockArgument> blockArgs{dropAnyHiddenArguments(
rewriteEntryBlock->getArguments(), funcTy, addThisPtr)};
std::int32_t idx = 0;
SmallVector<Value> blockValues(blockArgs.size());
std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin());
for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end;
++iter, ++idx) {
Value arg = *iter;
@@ -1042,6 +1071,16 @@
// Should the spec stipulate that pure device kernels must pass by
// read-only reference, i.e., take `const std::vector<T> &` arguments?
auto ptrInTy = cast<cudaq::cc::PointerType>(inTy);
// If this is a std::vector<bool>, unpack it.
if (stdvecTy.getElementType() == builder.getI1Type()) {
// Create a mock vector of i8 and populate the bools, 1 per char.
Value temp = builder.create<cudaq::cc::AllocaOp>(
loc, ptrInTy.getElementType());
builder.create<func::CallOp>(loc, std::nullopt,
cudaq::stdvecBoolUnpackToInitList,
ArrayRef<Value>{temp, arg});
arg = blockValues[idx] = temp;
}
// FIXME: call the `size` member function. For expediency, assume this
// is an std::vector and the size is the scaled delta between the
// first two pointers. Use the unscaled size for now.
@@ -1139,8 +1178,7 @@ class GenerateKernelExecution
Value vecToBuffer = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrI8Ty, buff, SmallVector<Value>{structSize});
// Ignore any hidden `this` argument.
for (auto inp : llvm::enumerate(dropAnyHiddenArguments(
rewriteEntryBlock->getArguments(), funcTy, addThisPtr))) {
for (auto inp : llvm::enumerate(blockValues)) {
Value arg = inp.value();
Type inTy = arg.getType();
std::int32_t idx = inp.index();
@@ -1152,6 +1190,17 @@
auto ptrInTy = cast<cudaq::cc::PointerType>(inTy);
vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg,
vecToBuffer, ptrInTy);
if (stdvecTy.getElementType() == builder.getI1Type()) {
auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type());
auto heapPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(ptrI1Ty), arg,
ArrayRef<cudaq::cc::ComputePtrArg>{0, 0});
auto loadHeapPtr = builder.create<cudaq::cc::LoadOp>(loc, heapPtr);
Value heapCast = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr);
builder.create<func::CallOp>(loc, std::nullopt, "free",
ArrayRef<Value>{heapCast});
}
} else if (auto strTy = dyn_cast<cudaq::cc::StructType>(quakeTy)) {
if (cudaq::cc::isDynamicType(strTy))
vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg,
@@ -1305,6 +1354,12 @@ class GenerateKernelExecution
cudaq::stdvecBoolCtorFromInitList);
return;
}
if (failed(irBuilder.loadIntrinsic(module,
cudaq::stdvecBoolUnpackToInitList))) {
module.emitError(std::string("could not load ") +
cudaq::stdvecBoolUnpackToInitList);
return;
}
if (failed(irBuilder.loadIntrinsic(module, cudaq::llvmMemCopyIntrinsic))) {
module.emitError(std::string("could not load ") +
cudaq::llvmMemCopyIntrinsic);
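
For readers of this hunk: a hypothetical C++ rendering of what the generated code does around a std::vector<bool> argument. The names FauxVector and passBoolVector are illustrative only and not part of the patch; the real work is emitted as IR by the pass above.

#include <cstdlib>
#include <vector>

extern "C" void __nvqpp_vector_bool_to_initializer_list(void *,
                                                         const std::vector<bool> &);

// Mirrors the three-pointer layout of the !cc.struct<{ptr, ptr, ptr}> temporary.
struct FauxVector { char *start; char *end; char *cap; };

void passBoolVector(const std::vector<bool> &arg) {
  FauxVector temp{};                                   // the cc.alloca stack slot
  __nvqpp_vector_bool_to_initializer_list(&temp, arg); // unpack bits, one byte per bool
  // ... encode [temp.start, temp.end) into the kernel's serialized argument buffer ...
  std::free(temp.start);                               // generated cleanup after encoding
}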
15 changes: 10 additions & 5 deletions lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -13,7 +13,6 @@
#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
#include "cudaq/Optimizer/Transforms/Passes.h"
#include "cudaq/Todo.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -412,7 +411,8 @@ class QuakeSynthesizer
// Process scalar floating point types.
if (type == builder.getF32Type()) {
synthesizeRuntimeArgument<float>(
builder, argument, args, offset, type.getIntOrFloatBitWidth() / 8,
builder, argument, args, offset,
cudaq::opt::convertBitsToBytes(type.getIntOrFloatBitWidth()),
[=](OpBuilder &builder, float *concrete) {
llvm::APFloat f(*concrete);
return builder.create<arith::ConstantFloatOp>(
@@ -422,7 +422,8 @@
}
if (type == builder.getF64Type()) {
synthesizeRuntimeArgument<double>(
builder, argument, args, offset, type.getIntOrFloatBitWidth() / 8,
builder, argument, args, offset,
cudaq::opt::convertBitsToBytes(type.getIntOrFloatBitWidth()),
[=](OpBuilder &builder, double *concrete) {
llvm::APFloat f(*concrete);
return builder.create<arith::ConstantFloatOp>(
@@ -443,7 +444,10 @@
char *ptrToSizeInBuffer = static_cast<char *>(args) + offset;
auto sizeFromBuffer =
*reinterpret_cast<std::uint64_t *>(ptrToSizeInBuffer);
auto vectorSize = sizeFromBuffer / (eleTy.getIntOrFloatBitWidth() / 8);
auto bytesInType =
cudaq::opt::convertBitsToBytes(eleTy.getIntOrFloatBitWidth());
assert(bytesInType > 0 && "element must have a size");
auto vectorSize = sizeFromBuffer / bytesInType;
stdVecInfo.emplace_back(argNum, eleTy, vectorSize);
continue;
}
@@ -508,7 +512,8 @@ class QuakeSynthesizer
doVector(std::int64_t{});
break;
default:
bufferAppendix += vecLength * (ty.getIntOrFloatBitWidth() / 8);
bufferAppendix += vecLength * cudaq::opt::convertBitsToBytes(
ty.getIntOrFloatBitWidth());
funcOp.emitOpError("synthesis failed for vector<integral-type>.");
break;
}
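
The switch to convertBitsToBytes matters most for i1: the old expression getIntOrFloatBitWidth() / 8 evaluates to 0 for a 1-bit type, making the element-count division above ill-defined. A small worked check of the arithmetic, inlining the helper sketched earlier (hypothetical values):

#include <cassert>
#include <cstdint>

int main() {
  std::uint64_t sizeFromBuffer = 5;  // 5 payload bytes for a vector<bool> of 5 elements
  unsigned oldBytes = 1 / 8;         // i1 under the old computation: 0 (division hazard)
  unsigned newBytes = (1 + 7) / 8;   // convertBitsToBytes(1) == 1
  assert(oldBytes == 0 && newBytes == 1);
  assert(sizeFromBuffer / newBytes == 5); // 5 elements recovered
  return 0;
}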
21 changes: 21 additions & 0 deletions runtime/cudaq/cudaq.cpp
@@ -432,5 +432,26 @@ void __nvqpp_initializer_list_to_vector_bool(std::vector<bool> &result,
// Free the initialization list, which was heap allocated.
free(initList);
}

/// Construct a block of 0 and 1 bytes that corresponds to the `vector<bool>`
/// values. This gets rid of the bit packing implementation of the
/// `std::vector<bool>` overload. The conversion turns the `std::vector<bool>`
/// into a mock vector structure that looks like `std::vector<char>`. The
/// calling routine must cleanup the buffer allocated by this code.
void __nvqpp_vector_bool_to_initializer_list(void *outData,
const std::vector<bool> &inVec) {
// The MockVector must be allocated by the caller.
struct MockVector {
char *start;
char *end;
};
MockVector *mockVec = reinterpret_cast<MockVector *>(outData);
auto outSize = inVec.size();
// The buffer allocated here must be freed by the caller.
mockVec->start = static_cast<char *>(malloc(outSize));
mockVec->end = mockVec->start + outSize;
for (unsigned i = 0; i < outSize; ++i)
(mockVec->start)[i] = static_cast<char>(inVec[i]);
}
}
} // namespace cudaq::support
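
For illustration, a caller exercising the new entry point (hypothetical test code, not part of the patch; the anonymous two-pointer struct stands in for the caller-allocated MockVector storage, and the caller owns the heap buffer):

#include <cassert>
#include <cstdlib>
#include <vector>

extern "C" void __nvqpp_vector_bool_to_initializer_list(void *,
                                                         const std::vector<bool> &);

int main() {
  struct { char *start; char *end; } out{};
  std::vector<bool> bits = {true, false, true};
  __nvqpp_vector_bool_to_initializer_list(&out, bits);
  assert(out.end - out.start == 3);                                  // one byte per bool
  assert(out.start[0] == 1 && out.start[1] == 0 && out.start[2] == 1);
  std::free(out.start); // caller cleanup, as documented above
  return 0;
}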
7 changes: 5 additions & 2 deletions runtime/cudaq/qis/qubit_qis.h
@@ -457,8 +457,11 @@ std::vector<measure_result> mz(qubit &q, Qs &&...qs) {

namespace support {
// Helper to initialize a `vector<bool>` data structure.
extern "C" void __nvqpp_initializer_list_to_vector_bool(std::vector<bool> &,
char *, std::size_t);
extern "C" {
void __nvqpp_initializer_list_to_vector_bool(std::vector<bool> &, char *,
std::size_t);
void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector<bool> &);
}
} // namespace support

// Measure the state in the given spin_op basis.