Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@ LogicalResult Verify(OpTy op) {
}

//===----------------------------------------------------------------------===//
// AllocRawOp
// TFAllocOp
//===----------------------------------------------------------------------===//
template <>
LogicalResult Verify<AllocRawOp>(AllocRawOp op) {
LogicalResult Verify<TFAllocOp>(TFAllocOp op) {
// Check that the total number of operands matches the number of dynamic
// dimensions specified in the memref type.
unsigned result_dyn_dims = op.getType().getNumDynamicDims();
Expand Down
27 changes: 17 additions & 10 deletions tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,28 @@ class TFFramework_Op<string mnemonic, list<OpTrait> traits = []> :
}

//===----------------------------------------------------------------------===//
// AllocRawOp
// TFAllocOp
//===----------------------------------------------------------------------===//
def TFFramework_AllocRawOp : TFFramework_Op<"alloc_raw",
def TFFramework_TFAllocOp : TFFramework_Op<"alloc",
[MemoryEffects<[MemAlloc<DefaultResource>]>]> {
let summary = "allocation of tensors that uses TF Framework";
let description = [{
Allocation of tensors during kernel execution in the Compute method.

This should be used to allocate any temporary or output memref.
Corresponds to `Allocator::AllocateRaw` in
tensorflow/core/framework/allocator.h.
This should be used to allocate any temporary or output memref. If
`output_index` and `input_indices` are given, attempts to forward one of
the input tensors to the output by calling `OpKernelContext::forward_input`.

If the attributes are missing or the forwarding fails, calls
`Allocator::AllocateRaw` in tensorflow/core/framework/allocator.h.
}];

let arguments = (ins TFFramework_OpKernelContextType:$ctx,
Variadic<Index>:$dyn_sizes);
let arguments = (ins
TFFramework_OpKernelContextType:$ctx,
Variadic<Index>:$dyn_sizes,
OptionalAttr<I32ArrayAttr>:$input_indices,
OptionalAttr<I32Attr>:$output_index
);
let results = (outs Res<AnyMemRef, "", [MemAlloc<DefaultResource>]>:$result);

let builders = [
Expand Down Expand Up @@ -92,16 +99,16 @@ def TFFramework_AllocRawOp : TFFramework_Op<"alloc_raw",
}

//===----------------------------------------------------------------------===//
// DeallocRawOp
// TFDeallocOp
//===----------------------------------------------------------------------===//
def TFFramework_DeallocRawOp : TFFramework_Op<"dealloc_raw",
def TFFramework_TFDeallocOp : TFFramework_Op<"dealloc",
[MemoryEffects<[MemFree]>]> {
let summary = "deallocation of tensors that uses TF Framework";
let description = [{
Deallocation of tensors during kernel execution in the Compute method.

This should be used to deallocate any temporary memref that was allocated
with `tf_framework.alloc_raw`.
with `tf_framework.alloc`.
Corresponds to `Allocator::DeallocateRaw` in
tensorflow/core/framework/allocator.h.
}];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ func @tf_entry(%size_0 : index , %size_2 : index) -> index
dealloc %buf : memref<?x10x?xf32>
std.return %size_0 : index
}
// CHECK-NEXT: [[VAL_3:%.*]] = tf_framework.alloc_raw
// CHECK-NEXT: [[VAL_3:%.*]] = tf_framework.alloc
// CHECK-SAME: ([[CTX]], [[SIZE_0]], [[SIZE_2]]) : memref<?x10x?xf32>
// CHECK-NEXT: tf_framework.dealloc_raw([[CTX]], [[VAL_3]]) : memref<?x10x?xf32>
// CHECK-NEXT: tf_framework.dealloc([[CTX]], [[VAL_3]]) : memref<?x10x?xf32>
// CHECK-NEXT: return [[SIZE_0]] : index

// -----
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

// Verifier test: one dynamic size operand for a memref with two dynamic dims
// must be rejected by the TFAllocOp verifier.
func @alloc_raw(%ctx: !tf_framework.op_kernel_context, %size : index) {
  // expected-error @+1 {{`dyn_sizes` count 1 does not match dynamic dimensions}}
  %buf = tf_framework.alloc(%ctx, %size) : memref<?x10x?xi8>
  return
}
25 changes: 18 additions & 7 deletions tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,28 @@
// Verify the generic form can be parsed.
// RUN: kernel-gen-opt -mlir-print-op-generic %s | kernel-gen-opt | FileCheck %s

// CHECK-LABEL: func @alloc_raw
func @alloc_raw(%ctx: !tf_framework.op_kernel_context,
// CHECK-LABEL: func @alloc
func @alloc(%ctx: !tf_framework.op_kernel_context,
%size_0 : index , %size_2 : index) {
%buf_0 = tf_framework.alloc_raw(%ctx) : memref<10xi8>
%buf_1 = tf_framework.alloc_raw(%ctx, %size_0, %size_2) : memref<?x10x?xi8>
%buf_0 = tf_framework.alloc(%ctx) : memref<10xi8>
%buf_1 = tf_framework.alloc(%ctx, %size_0, %size_2) : memref<?x10x?xi8>
return
}

// CHECK-LABEL: func @dealloc_raw
func @dealloc_raw(%ctx: !tf_framework.op_kernel_context, %memref : memref<?x10xf32>) {
tf_framework.dealloc_raw(%ctx, %memref) : memref<?x10xf32>
// CHECK-LABEL: func @forwarding_alloc
func @forwarding_alloc(%ctx: !tf_framework.op_kernel_context,
%size_0 : index , %size_2 : index) {
%buf = tf_framework.alloc(%ctx, %size_0, %size_2) {
input_indices = [0 : i32, 1 : i32],
output_index = 0 : i32
} : memref<?x10x?xi8>
return
}

// CHECK-LABEL: func @dealloc
func @dealloc(%ctx: !tf_framework.op_kernel_context,
%memref : memref<?x10xf32>) {
tf_framework.dealloc(%ctx, %memref) : memref<?x10xf32>
return
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
// RUN: kernel-gen-opt %s -tf-kernel-to-llvm -split-input-file | FileCheck %s
// RUN: kernel-gen-opt %s -tf-kernel-to-llvm -split-input-file --print-ir-after-all | FileCheck %s

// CHECK: llvm.func @_mlir_ciface_tf_alloc_raw
// CHECK-SAME: (!llvm.ptr<i8>, !llvm.i64) -> !llvm.ptr<i8>
// CHECK: llvm.func @_mlir_ciface_tf_alloc
// CHECK-SAME: (!llvm.ptr<i8>, !llvm.i64, !llvm.i32, !llvm.i32, !llvm.ptr<i32>) -> !llvm.ptr<i8>

// CHECK-LABEL: llvm.func @alloc_raw(
// CHECK-LABEL: llvm.func @alloc(
// CHECK-SAME: [[TF_CTX:%.*]]: !llvm.ptr<i8>,
// CHECK-SAME: [[SIZE_0:%.*]]: !llvm.i64,
// CHECK-SAME: [[SIZE_2:%.*]]: !llvm.i64) -> [[DESC_TY:!.*]] {
func @alloc_raw(%ctx: !tf_framework.op_kernel_context,
func @alloc(%ctx: !tf_framework.op_kernel_context,
%size_0 : index , %size_2 : index) -> memref<?x10x?xf32> {
%buf = tf_framework.alloc_raw(%ctx, %size_0, %size_2) : memref<?x10x?xf32>
%buf = tf_framework.alloc(%ctx, %size_0, %size_2) : memref<?x10x?xf32>
std.return %buf : memref<?x10x?xf32>
}
// Compute number of elements.
Expand All @@ -25,10 +25,19 @@ func @alloc_raw(%ctx: !tf_framework.op_kernel_context,
// CHECK: [[SIZE_OF_FLOAT:%.*]] = llvm.ptrtoint [[GEP]]
// CHECK-SAME: !llvm.ptr<float> to !llvm.i64

// Allocate memory.
// Compute total size in bytes.
// CHECK: [[NUM_BYTES:%.*]] = llvm.mul [[NUM_ELEM_1]], [[SIZE_OF_FLOAT]]
// CHECK: [[BYTES_PTR:%.*]] = llvm.call @{{.*}}([[TF_CTX]], [[NUM_BYTES]])
// CHECK-SAME: (!llvm.ptr<i8>, !llvm.i64) -> !llvm.ptr<i8>

// Compute output index (-1) and candidate indices (0, NULL).
// CHECK: [[OUTPUT_INDEX:%.*]] = llvm.mlir.constant(-1 : i32) : !llvm.i32
// CHECK-NEXT: [[NUM_CANDIDATES:%.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
// CHECK-NEXT: [[CANDIDATES_PTR:%.*]] = llvm.mlir.null : !llvm.ptr<i32>

// Allocate memory.
// CHECK: [[BYTES_PTR:%.*]] = llvm.call @{{.*}}([[TF_CTX]], [[NUM_BYTES]],
// CHECK-SAME: [[OUTPUT_INDEX]], [[NUM_CANDIDATES]], [[CANDIDATES_PTR]])
// CHECK-SAME: (!llvm.ptr<i8>, !llvm.i64, !llvm.i32, !llvm.i32, !llvm.ptr<i32>
// CHECK-SAME: ) -> !llvm.ptr<i8>

// Build memref descriptor.
// CHECK: [[DESC_0:%.*]] = llvm.mlir.undef : [[DESC_TY]]
Expand All @@ -55,13 +64,13 @@ func @alloc_raw(%ctx: !tf_framework.op_kernel_context,

// -----

// CHECK: llvm.func @_mlir_ciface_tf_dealloc_raw(!llvm.ptr<i8>, !llvm.ptr<i8>)
// CHECK: llvm.func @_mlir_ciface_tf_dealloc(!llvm.ptr<i8>, !llvm.ptr<i8>)

// CHECK-LABEL: llvm.func @dealloc_raw(
// CHECK-LABEL: llvm.func @dealloc(
// CHECK-SAME: [[TF_CTX:%.*]]: !llvm.ptr<i8>,
func @dealloc_raw(%ctx: !tf_framework.op_kernel_context,
func @dealloc(%ctx: !tf_framework.op_kernel_context,
%memref : memref<?x10xf32>) {
tf_framework.dealloc_raw(%ctx, %memref) : memref<?x10xf32>
tf_framework.dealloc(%ctx, %memref) : memref<?x10xf32>
return
}
// Extract allocated ptr from the memref descriptor.
Expand All @@ -71,5 +80,5 @@ func @dealloc_raw(%ctx: !tf_framework.op_kernel_context,
// CHECK-SAME: !llvm.ptr<float> to !llvm.ptr<i8>

// Deallocate.
// CHECK: llvm.call @_mlir_ciface_tf_dealloc_raw(
// CHECK: llvm.call @_mlir_ciface_tf_dealloc(
// CHECK-SAME: [[TF_CTX]], [[VOID_PTR]]) : (!llvm.ptr<i8>, !llvm.ptr<i8>) -> ()
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,49 @@ namespace tf_framework {
namespace {

using tensorflow::Allocator;
using tensorflow::AllocatorAttributes;

// Returns the TF allocator attached to the given OpKernelContext.
// `op_kernel_ctx` is a type-erased tensorflow::OpKernelContext*.
Allocator* GetAllocator(void* op_kernel_ctx) {
  auto* ctx = static_cast<tensorflow::OpKernelContext*>(op_kernel_ctx);
  // TODO(pifon): Figure out how to set AllocatorAttributes correctly.
  AllocatorAttributes attrs;
  return ctx->get_allocator(attrs);
}

} // namespace

extern "C" void* _mlir_ciface_tf_alloc_raw(void* op_kernel_ctx,
size_t num_bytes) {
// Allocates `num_bytes` of memory for a kernel output/temporary buffer.
//
// If `output_index` != -1, first attempts to forward one of the inputs listed
// in `candidate_input_indices` (of length `num_candidates`) to that output via
// OpKernelContext::forward_input; on success the forwarded tensor's buffer is
// returned. Otherwise falls back to a raw allocation through the context's
// allocator (Allocator::AllocateRaw).
extern "C" void* _mlir_ciface_tf_alloc(void* op_kernel_ctx, size_t num_bytes,
                                       int32_t output_index,
                                       int32_t num_candidates,
                                       int32_t* candidate_input_indices) {
  auto* ctx = static_cast<tensorflow::OpKernelContext*>(op_kernel_ctx);

  if (output_index != -1) {
    auto output_dtype = ctx->expected_output_dtype(output_index);
    // Bug fix: the DataType enum value was previously used directly as the
    // element size, so `num_bytes / element_size` divided by the enum's
    // numeric value. Use the actual per-element size in bytes instead.
    size_t element_size = tensorflow::DataTypeSize(output_dtype);
    // Guard against variable-size dtypes (DataTypeSize returns 0 for them)
    // to avoid a division by zero; forwarding is skipped in that case.
    if (element_size > 0) {
      // Create a 1D shape, because the shapes don't have to match exactly for
      // input forwarding. Only the number of elements must be the same.
      tensorflow::TensorShape output_shape;
      output_shape.AddDim(num_bytes / element_size);

      // Iterate over indices of all inputs that can potentially be used for
      // forwarding.
      for (int i = 0; i < num_candidates; ++i) {
        // TODO(pifon): Expose fetching AllocatorAttributes with the
        // output_index.
        AllocatorAttributes output_attr;
        auto tensor = ctx->forward_input(
            candidate_input_indices[i], output_index, output_dtype,
            output_shape, ctx->output_memory_type(output_index), output_attr);
        if (tensor != nullptr) {
          return tensor->data();
        }
      }
    }
  }
  // If no forwarding happened, allocate a chunk of memory.
  return GetAllocator(op_kernel_ctx)
      ->AllocateRaw(Allocator::kAllocatorAlignment, num_bytes);
}

extern "C" void _mlir_ciface_tf_dealloc_raw(void* op_kernel_ctx, void* ptr) {
extern "C" void _mlir_ciface_tf_dealloc(void* op_kernel_ctx, void* ptr) {
GetAllocator(op_kernel_ctx)->DeallocateRaw(ptr);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ namespace mlir {
namespace kernel_gen {
namespace tf_framework {

extern "C" MLIR_RUNNERUTILS_EXPORT void* _mlir_ciface_tf_alloc_raw(
void* op_kernel_ctx, size_t num_bytes);
extern "C" MLIR_RUNNERUTILS_EXPORT void* _mlir_ciface_tf_alloc(
void* op_kernel_ctx, size_t num_bytes, int32_t output_index,
int32_t num_candidates, int32_t* candidate_input_indices);

extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_dealloc_raw(
extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_dealloc(
void* op_kernel_ctx, void* ptr);

} // namespace tf_framework
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,19 +75,19 @@ class AllocOpConverter : public OpConversionPattern<AllocOp> {
return failure();
}
// Symbolic operands that bind to the symbols of the memref's layout map are
// not supported by AllocRawOp.
// not supported by TFAllocOp.
if (alloc.getNumSymbolicOperands() != 0) {
return failure();
}
rewriter.replaceOpWithNewOp<AllocRawOp>(alloc, alloc.getType(), ctx,
operands);
rewriter.replaceOpWithNewOp<TFAllocOp>(alloc, alloc.getType(), ctx,
operands);
return success();
}
};

// Converts std.dealloc to tf_framework.dealloc_raw using OpKernelContextType
// arg of the parent function.
class DeallocOpConverter : public OpConversionPattern<DeallocOp> {
class TFDeallocOpConverter : public OpConversionPattern<DeallocOp> {
public:
using OpConversionPattern<DeallocOp>::OpConversionPattern;

Expand All @@ -108,8 +108,8 @@ class DeallocOpConverter : public OpConversionPattern<DeallocOp> {
return failure();
}
DeallocOp::Adaptor transformed(operands);
rewriter.replaceOpWithNewOp<DeallocRawOp>(dealloc, ctx,
transformed.memref());
rewriter.replaceOpWithNewOp<TFDeallocOp>(dealloc, ctx,
transformed.memref());
return success();
}
};
Expand All @@ -118,7 +118,7 @@ class DeallocOpConverter : public OpConversionPattern<DeallocOp> {

// Registers the std-to-tf_framework conversion patterns (alloc, dealloc and
// function signature rewriting) into `patterns`.
void PopulateEmbedTFFrameworkConversionPatterns(
    MLIRContext *context, OwningRewritePatternList *patterns) {
  patterns->insert<AllocOpConverter, TFDeallocOpConverter, FuncOpConverter>(
      context);
}

Expand Down
Loading