NVIDIA · schweitzpgi · Apr 8, 2024 · Apr 3, 2024 · Apr 8, 2024 · Apr 8, 2024
diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
@@ -197,6 +197,19 @@ createInvariantLoop(mlir::OpBuilder &builder, mlir::Location loc,
                                             mlir::Region &, mlir::Block &)>
                         bodyBuilder);
 
+/// Builds a monotonic loop. A monotonic loop is a loop that is guaranteed to
+/// execute its body once for each value from \p start toward (but not
+/// including) \p stop, advancing by \p step each time. Exceptional conditions
+/// will cause the loop body to execute 0 times. Early exits are not allowed.
+/// This builder threads the loop control value, which will be returned as the
+/// value \p stop (or the next value near \p stop) when the loop exits.
+cc::LoopOp
+createMonotonicLoop(mlir::OpBuilder &builder, mlir::Location loc,
+                    mlir::Value start, mlir::Value stop, mlir::Value step,
+                    llvm::function_ref<void(mlir::OpBuilder &, mlir::Location,
+                                            mlir::Region &, mlir::Block &)>
+                        bodyBuilder);
+
 bool hasHiddenSRet(mlir::FunctionType funcTy);
 
 /// Convert the function type \p funcTy to a signature compatible with the code

diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -14,9 +14,23 @@ namespace cudaq {
 
 static constexpr const char llvmMemCopyIntrinsic[] =
     "llvm.memcpy.p0i8.p0i8.i64";
+
+// cudaq::range(count);
 static constexpr const char setCudaqRangeVector[] = "__nvqpp_CudaqRangeInit";
+// cudaq::range(start, stop, step);
+static constexpr const char setCudaqRangeVectorTriple[] =
+    "__nvqpp_CudaqRangeInitTriple";
+// Computes the number of iterations of the semi-open interval described by a
+// cudaq::range() triple.
+static constexpr const char getCudaqSizeFromTriple[] =
+    "__nvqpp_CudaqSizeFromTriple";
+
+// Convert a sequence of booleans (as bytes) into a std::vector<bool> (which is
+// typically specialized to be bit packed).
 static constexpr const char stdvecBoolCtorFromInitList[] =
     "__nvqpp_initializer_list_to_vector_bool";
+// Convert a (likely packed) std::vector<bool> into a sequence of bytes, each
+// holding a boolean value.
 static constexpr const char stdvecBoolUnpackToInitList[] =
     "__nvqpp_vector_bool_to_initializer_list";
 

diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -1873,17 +1873,38 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
       auto *block = builder.getBlock();
       IRBuilder irBuilder(builder.getContext());
       auto mod = block->getParentOp()->getParentOfType<ModuleOp>();
+      auto i64Ty = builder.getI64Type(); // element type
+      if (funcArity == 1) {
+        [[maybe_unused]] auto result =
+            irBuilder.loadIntrinsic(mod, setCudaqRangeVector);
+        assert(succeeded(result) && "loading intrinsic should never fail");
+        auto upVal = args[0];
+        auto upper = builder.create<cc::CastOp>(loc, i64Ty, upVal,
+                                                cc::CastOpMode::Unsigned);
+        auto buffer = builder.create<cc::AllocaOp>(loc, i64Ty, upper);
+        auto stdvecTy = cc::StdvecType::get(i64Ty);
+        auto call = builder.create<func::CallOp>(
+            loc, stdvecTy, setCudaqRangeVector, ValueRange{buffer, upper});
+        return pushValue(call.getResult(0));
+      }
+      assert(funcArity == 3);
       [[maybe_unused]] auto result =
-          irBuilder.loadIntrinsic(mod, setCudaqRangeVector);
+          irBuilder.loadIntrinsic(mod, setCudaqRangeVectorTriple);
       assert(succeeded(result) && "loading intrinsic should never fail");
-      auto upVal = args[0];
-      auto i64Ty = builder.getI64Type(); // element type
-      auto upper = builder.create<cc::CastOp>(loc, i64Ty, upVal,
-                                              cc::CastOpMode::Unsigned);
-      auto buffer = builder.create<cc::AllocaOp>(loc, i64Ty, upper);
+      Value start = builder.create<cc::CastOp>(loc, i64Ty, args[0],
+                                               cc::CastOpMode::Signed);
+      Value stop = builder.create<cc::CastOp>(loc, i64Ty, args[1],
+                                              cc::CastOpMode::Signed);
+      Value step = builder.create<cc::CastOp>(loc, i64Ty, args[2],
+                                              cc::CastOpMode::Signed);
+      auto lengthCall = builder.create<func::CallOp>(
+          loc, i64Ty, getCudaqSizeFromTriple, ValueRange{start, stop, step});
+      Value length = lengthCall.getResult(0);
+      auto buffer = builder.create<cc::AllocaOp>(loc, i64Ty, length);
       auto stdvecTy = cc::StdvecType::get(i64Ty);
-      auto call = builder.create<func::CallOp>(
-          loc, stdvecTy, setCudaqRangeVector, ValueRange{buffer, upper});
+      auto call =
+          builder.create<func::CallOp>(loc, stdvecTy, setCudaqRangeVectorTriple,
+                                       ValueRange{buffer, start, stop, step});
       return pushValue(call.getResult(0));
     }
 

diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp
@@ -134,8 +134,9 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x,
     auto eleTy = stdvecTy.getElementType();
     auto dataPtrTy = cc::PointerType::get(eleTy);
     auto dataArrPtrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
-    auto [iters, ptr] = [&]() -> std::pair<Value, Value> {
-      if (auto call = buffer.getDefiningOp<func::CallOp>())
+    auto [iters, ptr, initial,
+          stepBy] = [&]() -> std::tuple<Value, Value, Value, Value> {
+      if (auto call = buffer.getDefiningOp<func::CallOp>()) {
         if (call.getCallee().equals(setCudaqRangeVector)) {
           // The std::vector was produced by cudaq::range(). Optimize this
           // special case to use the loop control directly. Erase the transient
@@ -148,11 +149,38 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x,
             // shouldn't get here, but we can erase the call at minimum
             call->erase();
           }
-          return {i, {}};
+          return {i, {}, {}, {}};
+        } else if (call.getCallee().equals(setCudaqRangeVectorTriple)) {
+          // The std::vector was produced by cudaq::range(start, stop, step).
+          // Read every operand that is needed below before `call` is erased:
+          // erasing destroys the operation, so reading its operands afterwards
+          // would be a use-after-free.
+          Value rangeStart = call.getOperand(1);
+          Value i = call.getOperand(2); // the `stop` bound of the range
+          Value rangeStep = call.getOperand(3);
+          if (auto alloc = call.getOperand(0).getDefiningOp<cc::AllocaOp>()) {
+            // If the buffer length was computed by the size intrinsic, remember
+            // that call so it can be erased after the buffer that uses it.
+            Operation *callGetSizeOp = nullptr;
+            if (auto seqSize = alloc.getSeqSize()) {
+              if (auto callSize = seqSize.getDefiningOp<func::CallOp>())
+                if (callSize.getCallee().equals(getCudaqSizeFromTriple))
+                  callGetSizeOp = callSize.getOperation();
+            }
+            call->erase(); // erase call must be first
+            alloc->erase();
+            if (callGetSizeOp)
+              callGetSizeOp->erase();
+          } else {
+            // shouldn't get here, but we can erase the call at minimum
+            call->erase();
+          }
+          return {i, {}, rangeStart, rangeStep};
         }
+      }
       Value i = builder.create<cc::StdvecSizeOp>(loc, i64Ty, buffer);
       Value p = builder.create<cc::StdvecDataOp>(loc, dataArrPtrTy, buffer);
-      return {i, p};
+      return {i, p, {}, {}};
     }();
 
     auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region &region,
@@ -196,9 +224,16 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x,
       builder.create<cc::ScopeOp>(loc, scopeBuilder);
     };
 
-    auto idxIters = builder.create<cudaq::cc::CastOp>(
-        loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned);
-    opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder);
+    if (!initial) {
+      auto idxIters = builder.create<cudaq::cc::CastOp>(
+          loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned);
+      opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder);
+    } else {
+      auto idxIters = builder.create<cudaq::cc::CastOp>(
+          loc, i64Ty, iters, cudaq::cc::CastOpMode::Signed);
+      opt::factory::createMonotonicLoop(builder, loc, initial, idxIters, stepBy,
+                                        bodyBuilder);
+    }
   } else if (auto veqTy = dyn_cast<quake::VeqType>(buffer.getType());
              veqTy && veqTy.hasSpecifiedSize()) {
     Value iters =

diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp
@@ -6,7 +6,7 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-#include "cudaq/Optimizer/Builder/Factory.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
@@ -175,24 +175,21 @@ cc::LoopOp factory::createInvariantLoop(
   auto loop = builder.create<cc::LoopOp>(
       loc, resultTys, inputs, /*postCondition=*/false,
       [&](OpBuilder &builder, Location loc, Region &region) {
-        cc::RegionBuilderGuard guard(builder, loc, region,
-                                     TypeRange{zero.getType()});
+        cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty});
         auto &block = *builder.getBlock();
         Value cmpi = builder.create<arith::CmpIOp>(
             loc, arith::CmpIPredicate::slt, block.getArgument(0),
             totalIterations);
         builder.create<cc::ConditionOp>(loc, cmpi, block.getArguments());
       },
       [&](OpBuilder &builder, Location loc, Region &region) {
-        cc::RegionBuilderGuard guard(builder, loc, region,
-                                     TypeRange{zero.getType()});
+        cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty});
         auto &block = *builder.getBlock();
         bodyBuilder(builder, loc, region, block);
         builder.create<cc::ContinueOp>(loc, block.getArguments());
       },
       [&](OpBuilder &builder, Location loc, Region &region) {
-        cc::RegionBuilderGuard guard(builder, loc, region,
-                                     TypeRange{zero.getType()});
+        cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty});
         auto &block = *builder.getBlock();
         auto incr =
             builder.create<arith::AddIOp>(loc, block.getArgument(0), one);
@@ -202,6 +199,63 @@ cc::LoopOp factory::createInvariantLoop(
   return loop;
 }
 
+// Emits a monotonic loop directly in its invariant (counted) form so that loop
+// analysis and normalization do not have to derive that form later. The loop
+// carries two values: an iteration counter and the stepped control value.
+cc::LoopOp factory::createMonotonicLoop(
+    OpBuilder &builder, Location loc, Value start, Value stop, Value step,
+    llvm::function_ref<void(OpBuilder &, Location, Region &, Block &)>
+        bodyBuilder) {
+  IRBuilder irBuilder(builder.getContext());
+  auto mod = builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
+  [[maybe_unused]] auto loadedIntrinsic =
+      irBuilder.loadIntrinsic(mod, getCudaqSizeFromTriple);
+  assert(succeeded(loadedIntrinsic) && "loading intrinsic should never fail");
+  auto i64Ty = builder.getI64Type();
+  auto asSignedI64 = [&](Value v) -> Value {
+    return builder.create<cc::CastOp>(loc, i64Ty, v, cc::CastOpMode::Signed);
+  };
+  Value first = asSignedI64(start);
+  Value stride = asSignedI64(step);
+  Value last = asSignedI64(stop);
+  Value zero = builder.create<arith::ConstantIntOp>(loc, 0, 64);
+  SmallVector<Type> carriedTys = {i64Ty, i64Ty}; // {counter, control value}
+  auto sizeCall = builder.create<func::CallOp>(
+      loc, i64Ty, getCudaqSizeFromTriple, ValueRange{first, last, stride});
+  Value tripCount = sizeCall.getResult(0);
+  // While region: keep iterating while the counter is below the trip count.
+  auto whileRegion = [&](OpBuilder &builder, Location loc, Region &region) {
+    cc::RegionBuilderGuard guard(builder, loc, region, carriedTys);
+    Block &entry = *builder.getBlock();
+    Value inRange = builder.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::slt, entry.getArgument(0), tripCount);
+    builder.create<cc::ConditionOp>(loc, inRange, entry.getArguments());
+  };
+  // Body region: the caller fills in the body; both values pass through as is.
+  auto bodyRegion = [&](OpBuilder &builder, Location loc, Region &region) {
+    cc::RegionBuilderGuard guard(builder, loc, region, carriedTys);
+    Block &entry = *builder.getBlock();
+    bodyBuilder(builder, loc, region, entry);
+    builder.create<cc::ContinueOp>(loc, entry.getArguments());
+  };
+  // Step region: bump the counter by one and the control value by the stride.
+  auto stepRegion = [&](OpBuilder &builder, Location loc, Region &region) {
+    cc::RegionBuilderGuard guard(builder, loc, region, carriedTys);
+    Block &entry = *builder.getBlock();
+    auto one = builder.create<arith::ConstantIntOp>(loc, 1, 64);
+    Value nextCount =
+        builder.create<arith::AddIOp>(loc, entry.getArgument(0), one);
+    Value nextValue =
+        builder.create<arith::AddIOp>(loc, entry.getArgument(1), stride);
+    builder.create<cc::ContinueOp>(loc, ValueRange{nextCount, nextValue});
+  };
+  auto loop = builder.create<cc::LoopOp>(
+      loc, carriedTys, ValueRange{zero, first}, /*postCondition=*/false,
+      whileRegion, bodyRegion, stepRegion);
+  loop->setAttr("invariant", builder.getUnitAttr());
+  return loop;
+}
+
 // FIXME: some ABIs may return a small struct in registers rather than via an
 // sret pointer.
 //

diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -67,11 +67,77 @@ static constexpr IntrinsicCode intrinsicTable[] = {
         %one = arith.constant 1 : i64
         %s1 = arith.addi %i, %one : i64
         cc.continue %s1 : i64
-    }
+    } {invariant}
     %2 = cc.stdvec_init %arg0, %arg1 : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.stdvec<i64>
     return %2 : !cc.stdvec<i64>
   })#"},
 
+    // Compute and initialize a vector from a semi-open triple style notation.
+    // The vector returned will contain the ordered set defined by the triple.
+    // That set is specifically `{ i, i+s, i+2*s, ... i+(n-1)*s }` where `i` is
+    // the initial value `%arg1`, `s` is the step, `%arg3`, and the value
+    // `i+(n-1)*s` is strictly in the interval `[%arg1 .. %arg2)` or `(%arg2 ..
+    // %arg1]` depending on whether `%arg3` is positive or negative. Invalid
+    // triples, such as the step being zero or the lower and upper bounds being
+    // transposed, will return a vector of length 0 (an empty set). Note that
+    // all three parameters are assumed to be signed values, which is required
+    // to have a decrementing loop.
+    {cudaq::setCudaqRangeVectorTriple,
+     {cudaq::getCudaqSizeFromTriple},
+     R"#(
+  func.func private @__nvqpp_CudaqRangeInitTriple(%arg0: !cc.ptr<!cc.array<i64 x ?>>, %arg1: i64, %arg2: i64, %arg3: i64) -> !cc.stdvec<i64> {
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %0 = call @__nvqpp_CudaqSizeFromTriple(%arg1, %arg2, %arg3) : (i64, i64, i64) -> i64
+    %1:2 = cc.loop while ((%arg4 = %c0_i64, %arg5 = %arg1) -> (i64, i64)) {
+      %3 = arith.cmpi ult, %arg4, %0 : i64
+      cc.condition %3(%arg4, %arg5 : i64, i64)
+    } do {
+    ^bb0(%arg4: i64, %arg5: i64):
+      %3 = cc.compute_ptr %arg0[%arg4] : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.ptr<i64>
+      cc.store %arg5, %3 : !cc.ptr<i64>
+      cc.continue %arg4, %arg5 : i64, i64
+    } step {
+    ^bb0(%arg4: i64, %arg5: i64):
+      %3 = arith.addi %arg4, %c1_i64 : i64
+      %4 = arith.addi %arg5, %arg3 : i64
+      cc.continue %3, %4 : i64, i64
+    } {invariant}
+    %2 = cc.stdvec_init %arg0, %0 : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.stdvec<i64>
+    return %2 : !cc.stdvec<i64>
+  })#"},
+
+    // Compute the total number of iterations, which is the value `n`, from a
+    // semi-open triple style notation. The set defined by the triple is `{ i,
+    // i+s, i+2*s, ... i+(n-1)*s }` where `i` is the initial value `%start`, `s`
+    // is the step, `%step`, and the value `i+(n-1)*s` is strictly in the
+    // interval `[start .. stop)` or `(stop .. start]` depending on whether step
+    // is positive or negative. Invalid triples (a zero step, or transposed
+    // lower and upper bounds) return 0; the zero step is rejected before the
+    // division by `%step` can execute. All three parameters are assumed to be
+    // signed values, which is required to have a decrementing loop.
+    {cudaq::getCudaqSizeFromTriple,
+     {},
+     R"#(
+  func.func private @__nvqpp_CudaqSizeFromTriple(%start: i64, %stop: i64, %step: i64) -> i64 {
+    %0 = arith.constant 0 : i64
+    %1 = arith.constant 1 : i64
+    %n1 = arith.constant -1 : i64
+    %c1 = arith.cmpi ne, %step, %0 : i64
+    cf.cond_br %c1, ^b1, ^exit(%0 : i64)
+   ^b1:
+    %c2 = arith.cmpi sgt, %step, %0 : i64
+    %adjust = arith.select %c2, %1, %n1 : i64
+    %2 = arith.subi %stop, %adjust : i64
+    %3 = arith.subi %2, %start : i64
+    %4 = arith.addi %3, %step : i64
+    %5 = arith.divsi %4, %step : i64
+    %c3 = arith.cmpi sgt, %5, %0 : i64
+    cf.cond_br %c3, ^exit(%5 : i64), ^exit(%0 : i64)
+   ^exit(%rv : i64):
+    return %rv : i64
+  })#"},
+
     {"__nvqpp_createDynamicResult",
      {cudaq::llvmMemCopyIntrinsic, "malloc"},
      R"#(

diff --git a/runtime/cudaq/builder/kernels.cpp b/runtime/cudaq/builder/kernels.cpp
@@ -140,8 +140,8 @@ std::vector<double> getAlphaZ(const std::span<double> data,
 std::vector<double> getAlphaY(const std::span<double> data,
                               std::size_t numQubits, std::size_t k) {
   std::vector<std::vector<std::size_t>> inNum, inDenom;
-  auto twoNmK = (1ULL << (numQubits - k)), twoK = (1ULL << k),
-       twoKmOne = (1ULL << (k - 1));
+  auto twoNmK = (1LL << (numQubits - k)), twoK = (1LL << k),
+       twoKmOne = (1LL << (k - 1));
   for (auto j : cudaq::range(twoNmK)) {
     std::vector<std::size_t> local;
     for (auto l : cudaq::range(twoKmOne))
@@ -184,4 +184,4 @@ std::vector<double> getAlphaY(const std::span<double> data,
 
   return res;
 }
-} // namespace cudaq::details
+} // namespace cudaq::details
diff --git a/runtime/cudaq/builder/kernels.h b/runtime/cudaq/builder/kernels.h
@@ -62,8 +62,9 @@ template <typename Kernel>
 void from_state(Kernel &&kernel, QuakeValue &qubits,
                 const std::span<std::complex<double>> data,
                 std::size_t inNumQubits = 0) {
-  auto numQubits = qubits.constantSize().value_or(inNumQubits);
-  if (numQubits == 0)
+  std::make_signed_t<std::size_t> numQubits =
+      qubits.constantSize().value_or(inNumQubits);
+  if (numQubits <= 0)
     throw std::runtime_error(
         "[from_state] cannot infer size of input quantum register, please "
         "specify the number of qubits via the from_state() final argument.");