feat(trtorchc): Embedding engines in modules from the CLI

narendasan · narendasan · commit 2b4b9e358982 · 2021-07-28T10:34:11.000-07:00
Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
diff --git a/cpp/trtorchc/README.md b/cpp/trtorchc/README.md
@@ -57,6 +57,10 @@ trtorchc [input_file_path] [output_file_path]
       --calibration-cache-file=[file_path]
                                         Path to calibration cache file to use
                                         for post training quantization
+      --embed-engine                    Whether to treat input file as a
+                                        serialized TensorRT engine and embed it
+                                        into a TorchScript module (device spec
+                                        must be provided)
       --num-min-timing-iter=[num_iters] Number of minimization timing iterations
                                         used to select kernels
       --num-avg-timing-iters=[num_iters]
diff --git a/cpp/trtorchc/main.cpp b/cpp/trtorchc/main.cpp
@@ -135,6 +135,18 @@ std::vector<std::vector<int64_t>> parseDynamicDim(std::string shape_str) {
   return shape;
 }
 
+// Reads the entire file at `path` as raw bytes. Returns an empty string when the file cannot be opened.
+std::string read_buf(std::string const& path) {
+  std::ifstream stream(path.c_str(), std::ios::binary);
+  if (!stream) {
+    return std::string();
+  }
+
+  // istreambuf_iterator pulls unformatted bytes straight from the stream buffer, so whitespace bytes in
+  // the file are preserved without std::noskipws and without a formatted extraction per character.
+  return std::string(std::istreambuf_iterator<char>(stream), std::istreambuf_iterator<char>());
+}
+
 std::string get_cwd() {
   char buff[FILENAME_MAX]; // create string buffer to hold path
   if (getcwd(buff, FILENAME_MAX)) {
@@ -224,6 +236,13 @@ int main(int argc, char** argv) {
       "file_path",
       "Path to calibration cache file to use for post training quantization",
       {"calibration-cache-file"});
+
+  args::Flag embed_engine(
+      parser,
+      "embed-engine",
+      "Whether to treat input file as a serialized TensorRT engine and embed it into a TorchScript module (device spec must be provided)",
+      {"embed-engine"});
+
   args::ValueFlag<int> num_min_timing_iters(
       parser, "num_iters", "Number of minimization timing iterations used to select kernels", {"num-min-timing-iter"});
   args::ValueFlag<int> num_avg_timing_iters(
@@ -484,6 +503,14 @@ int main(int argc, char** argv) {
   auto real_input_path = resolve_path(args::get(input_path));
   auto real_output_path = resolve_path(args::get(output_path));
 
+  // Instead of compiling, just embed engine in a PyTorch module
+  if (embed_engine) {
+    std::string serialized_engine = read_buf(real_input_path);
+    auto trt_mod = trtorch::EmbedEngineInNewModule(serialized_engine, compile_settings.device);
+    trt_mod.save(real_output_path);
+    return 0;
+  }
+
   torch::jit::Module mod;
   try {
     // Deserialize the ScriptModule from a file using torch::jit::load().
diff --git a/docsrc/tutorials/trtorchc.rst b/docsrc/tutorials/trtorchc.rst
@@ -19,79 +19,83 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
     trtorchc [input_file_path] [output_file_path]
         [input_specs...] {OPTIONS}
 
-    TRTorch is a compiler for TorchScript, it will compile and optimize
-    TorchScript programs to run on NVIDIA GPUs using TensorRT
+        TRTorch is a compiler for TorchScript, it will compile and optimize
+        TorchScript programs to run on NVIDIA GPUs using TensorRT
 
-  OPTIONS:
+      OPTIONS:
 
-      -h, --help                        Display this help menu
-      Verbiosity of the compiler
-        -v, --verbose                     Dumps debugging information about the
-                                          compilation process onto the console
-        -w, --warnings                    Disables warnings generated during
-                                          compilation onto the console (warnings
-                                          are on by default)
-        --i, --info                       Dumps info messages generated during
-                                          compilation onto the console
-      --build-debuggable-engine         Creates a debuggable engine
-      --use-strict-types                Restrict operating type to only use set
-                                        operation precision
-      --allow-gpu-fallback              (Only used when targeting DLA
-                                        (device-type)) Lets engine run layers on
-                                        GPU if they are not supported on DLA
-      --disable-tf32                    Prevent Float32 layers from using the
-                                        TF32 data format
-      -p[precision...],
-      --enabled-precison=[precision...] (Repeatable) Enabling an operating
-                                        precision for kernels to use when
-                                        building the engine (Int8 requires a
-                                        calibration-cache argument) [ float |
-                                        float32 | f32 | half | float16 | f16 |
-                                        int8 | i8 ] (default: float)
-      -d[type], --device-type=[type]    The type of device the engine should be
-                                        built for [ gpu | dla ] (default: gpu)
-      --gpu-id=[gpu_id]                 GPU id if running on multi-GPU platform
-                                        (defaults to 0)
-      --dla-core=[dla_core]             DLACore id if running on available DLA
-                                        (defaults to 0)
-      --engine-capability=[capability]  The type of device the engine should be
-                                        built for [ default | safe_gpu |
-                                        safe_dla ]
-      --calibration-cache-file=[file_path]
-                                        Path to calibration cache file to use
-                                        for post training quantization
-      --num-min-timing-iter=[num_iters] Number of minimization timing iterations
-                                        used to select kernels
-      --num-avg-timing-iters=[num_iters]
-                                        Number of averaging timing iterations
-                                        used to select kernels
-      --workspace-size=[workspace_size] Maximum size of workspace given to
-                                        TensorRT
-      --max-batch-size=[max_batch_size] Maximum batch size (must be >= 1 to be
-                                        set, 0 means not set)
-      -t[threshold],
-      --threshold=[threshold]           Maximum acceptable numerical deviation
-                                        from standard torchscript output
-                                        (default 2e-5)
-      --save-engine                     Instead of compiling a full a
-                                        TorchScript program, save the created
-                                        engine to the path specified as the
-                                        output path
-      input_file_path                   Path to input TorchScript file
-      output_file_path                  Path for compiled TorchScript (or
-                                        TensorRT engine) file
-      input_specs...                    Specs for inputs to engine, can either
-                                        be a single size or a range defined by
-                                        Min, Optimal, Max sizes, e.g.
-                                        "(N,..,C,H,W)"
-                                        "[(MIN_N,..,MIN_C,MIN_H,MIN_W);(OPT_N,..,OPT_C,OPT_H,OPT_W);(MAX_N,..,MAX_C,MAX_H,MAX_W)]".
-                                        Data Type and format can be specified by
-                                        adding an "@" followed by dtype and "%"
-                                        followed by format to the end of the
-                                        shape spec. e.g. "(3, 3, 32,
-                                        32)@f16%NHWC"
-      "--" can be used to terminate flag options and force all following
-      arguments to be treated as positional options
+          -h, --help                        Display this help menu
+          Verbiosity of the compiler
+            -v, --verbose                     Dumps debugging information about the
+                                              compilation process onto the console
+            -w, --warnings                    Disables warnings generated during
+                                              compilation onto the console (warnings
+                                              are on by default)
+            --i, --info                       Dumps info messages generated during
+                                              compilation onto the console
+          --build-debuggable-engine         Creates a debuggable engine
+          --use-strict-types                Restrict operating type to only use set
+                                            operation precision
+          --allow-gpu-fallback              (Only used when targeting DLA
+                                            (device-type)) Lets engine run layers on
+                                            GPU if they are not supported on DLA
+          --disable-tf32                    Prevent Float32 layers from using the
+                                            TF32 data format
+          -p[precision...],
+          --enabled-precison=[precision...] (Repeatable) Enabling an operating
+                                            precision for kernels to use when
+                                            building the engine (Int8 requires a
+                                            calibration-cache argument) [ float |
+                                            float32 | f32 | half | float16 | f16 |
+                                            int8 | i8 ] (default: float)
+          -d[type], --device-type=[type]    The type of device the engine should be
+                                            built for [ gpu | dla ] (default: gpu)
+          --gpu-id=[gpu_id]                 GPU id if running on multi-GPU platform
+                                            (defaults to 0)
+          --dla-core=[dla_core]             DLACore id if running on available DLA
+                                            (defaults to 0)
+          --engine-capability=[capability]  The type of device the engine should be
+                                            built for [ default | safe_gpu |
+                                            safe_dla ]
+          --calibration-cache-file=[file_path]
+                                            Path to calibration cache file to use
+                                            for post training quantization
+          --embed-engine                    Whether to treat input file as a
+                                            serialized TensorRT engine and embed it
+                                            into a TorchScript module (device spec
+                                            must be provided)
+          --num-min-timing-iter=[num_iters] Number of minimization timing iterations
+                                            used to select kernels
+          --num-avg-timing-iters=[num_iters]
+                                            Number of averaging timing iterations
+                                            used to select kernels
+          --workspace-size=[workspace_size] Maximum size of workspace given to
+                                            TensorRT
+          --max-batch-size=[max_batch_size] Maximum batch size (must be >= 1 to be
+                                            set, 0 means not set)
+          -t[threshold],
+          --threshold=[threshold]           Maximum acceptable numerical deviation
+                                            from standard torchscript output
+                                            (default 2e-5)
+          --save-engine                     Instead of compiling a full a
+                                            TorchScript program, save the created
+                                            engine to the path specified as the
+                                            output path
+          input_file_path                   Path to input TorchScript file
+          output_file_path                  Path for compiled TorchScript (or
+                                            TensorRT engine) file
+          input_specs...                    Specs for inputs to engine, can either
+                                            be a single size or a range defined by
+                                            Min, Optimal, Max sizes, e.g.
+                                            "(N,..,C,H,W)"
+                                            "[(MIN_N,..,MIN_C,MIN_H,MIN_W);(OPT_N,..,OPT_C,OPT_H,OPT_W);(MAX_N,..,MAX_C,MAX_H,MAX_W)]".
+                                            Data Type and format can be specified by
+                                            adding an "@" followed by dtype and "%"
+                                            followed by format to the end of the
+                                            shape spec. e.g. "(3, 3, 32,
+                                            32)@f16%NHWC"
+          "--" can be used to terminate flag options and force all following
+          arguments to be treated as positional options
 
 
 e.g.