Commit
This speeds up multi-GPU training with Horovod by asynchronously triggering the outputs of XLA clusters that feed HorovodAllreduce nodes.

The feature is currently off by default; turn it on by setting TF_XLA_FLAGS="--tf_xla_auto_jit=1 --tf_xla_async_io_level=1".

Some data points on 8 GPUs on a DGX-1, XLA-Async vs. XLA-Sync (i.e., before this commit):
- 13% perf gain on BERT-Large pretrain SQuAD, FP32, BatchSize=2
- 7% perf gain on UNet medical trainbench, FP32

Design doc: https://docs.google.com/document/d/1oohJC3BgQYmCb0njAf1Iqd1MShg4cPSeu9ZelnOFY8c/edit

Authors: Trent Lo and Ayan Moitra
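A minimal sketch of how the flag might be enabled in a Horovod training script. Only the TF_XLA_FLAGS value comes from this commit; the rest of the setup (the imports, GPU pinning, and script structure) is a typical Horovod pattern shown for illustration:

    import os

    # Must be set before TensorFlow initializes XLA. The flag values are
    # the ones documented in this commit; everything below is illustrative.
    os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=1 --tf_xla_async_io_level=1"

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    # Pin each process to a single GPU, as in a typical Horovod setup.
    gpus = tf.config.experimental.list_physical_devices("GPU")
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")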