
Building the BEVFusion TRT engines via the C++ API instead of trtexec, they run but give garbage results #240

Open
josh-wende opened this issue Mar 19, 2024 · 1 comment

Comments


josh-wende commented Mar 19, 2024

I am trying to integrate this TensorRT implementation of BEVFusion into an existing codebase that has TensorRT but not trtexec. As such, I have been trying to build the engines via the C++ API instead of trtexec and the build_trt_engine.sh script. The engines build and run inference without error, but the results I get are garbage and do not at all match the ones I get from your standalone implementation.

The code I wrote to build the engine is as follows (with TensorRT version 8.6.1.6-1+cuda11.8):

bool BEVEngine::Build(bool int8) {
  // create the builder
  const int kBatchSize = 1;
  const auto explicit_batch = 1U << static_cast<uint32_t>(
      nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  builder_ = nvinfer1::createInferBuilder(rt_gLogger);

  network_ = builder_->createNetworkV2(explicit_batch);

  // parse onnx model
  auto parser = nvonnxparser::createParser(*network_, rt_gLogger);
  int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kVERBOSE);
  if (!parser->parseFromFile(onnx_filepath_.c_str(), verbosity)) {
    return false;
  }

  // Deprecated in TensorRT 8.x and ignored for explicit-batch networks,
  // but harmless to call.
  builder_->setMaxBatchSize(kBatchSize);

  builder_config_ = builder_->createBuilderConfig();

  // 2 GiB; note that a plain (1 << 31) overflows a signed 32-bit int.
  workspaceSize_ = 1ULL << 31;
  builder_config_->setMaxWorkspaceSize(workspaceSize_);

  // int8 is set to true for resnet50int8, except for camera.vtransform.onnx and
  // head.bbox.onnx, which do not have any Quantize/Dequantize layers
  if (int8) {
    if (builder_->platformHasFastInt8()) {
      builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8);
    } else {
      return false;
    }
  }

  const int numInputs = network_->getNbInputs();
  const int numOutputs = network_->getNbOutputs();

  // Check that the GPU supports FP16 inference and set input/output types if so
  if (builder_->platformHasFastFp16()) {
    builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
    // set inputs and outputs to fp16:chw
    for (int i = 0; i < numInputs; ++i) {
      network_->getInput(i)->setType(nvinfer1::DataType::kHALF);
      network_->getInput(i)->setAllowedFormats(
          1U << static_cast<int>(nvinfer1::TensorFormat::kCHW16));
    }
    for (int i = 0; i < numOutputs; ++i) {
      network_->getOutput(i)->setType(nvinfer1::DataType::kHALF);
      network_->getOutput(i)->setAllowedFormats(
          1U << static_cast<int>(nvinfer1::TensorFormat::kCHW16));
    }
  } else {
    return false;
  }

  builder_config_->setFlag(nvinfer1::BuilderFlag::kOBEY_PRECISION_CONSTRAINTS);

  // Register a single optimization profile
  nvinfer1::IOptimizationProfile* optProfile =
      builder_->createOptimizationProfile();
  for (int32_t i = 0; i < numInputs; ++i) {
    const auto input = network_->getInput(i);
    const auto inputName = input->getName();
    nvinfer1::Dims inputDims = input->getDimensions();
    nvinfer1::Dims modifiedDims = inputDims;
    modifiedDims.d[0] = kBatchSize;

    optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN,
                              modifiedDims);
    optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT,
                              modifiedDims);
    optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX,
                              modifiedDims);
  }
  builder_config_->addOptimizationProfile(optProfile);

  // CUDA stream used for profiling by the builder.
  cudaStream_t profileStream;
  cudaError_t return_code = cudaStreamCreate(&profileStream);
  if (return_code != cudaSuccess) {
    return false;
  }
  builder_config_->setProfileStream(profileStream);

  // Build the engine
  std::unique_ptr<nvinfer1::IHostMemory> plan{
      builder_->buildSerializedNetwork(*network_, *builder_config_)};
  if (!plan) {
    return false;
  }

  // Write the engine to disk
  std::filesystem::create_directory(root_dir_ + "build");
  std::ofstream outfile(root_dir_ + "build/" + net_name_ + ".plan",
                        std::ofstream::binary);
  outfile.write(reinterpret_cast<const char*>(plan->data()), plan->size());

  return_code = cudaStreamDestroy(profileStream);
  if (return_code != cudaSuccess) {
    return false;
  }
  return true;
}

This successfully builds the engines and saves them to the *.plan files. When I try to run a sample image that works well with the trtexec version, I get the following garbage outputs:

==================BEVFusion===================
[⏰ [NoSt] CopyLidar]: 	0.10854 ms
[⏰ [NoSt] ImageNrom]: 	0.83587 ms
[⏰ Lidar Backbone]: 	2.01750 ms
[⏰ Camera Depth]: 	0.03994 ms
[⏰ Camera Backbone]: 	41.31533 ms
[⏰ Camera Bevpool]: 	0.39117 ms
[⏰ VTransform]: 	1.16736 ms
[⏰ Transfusion]: 	9.61210 ms
[⏰ Head BoundingBox]: 	3.25120 ms
Total: 57.795 ms
=============================================
num bboxes: 12
object id: 1
position: -54 | 38.55 | -0.5
velocity: 0 | 131.625
size: 1 | inf | 1
score: 176.875
object id: 3
position: -54 | -23.6437 | -0.5
velocity: 0 | 84.5625
size: 1 | inf | 1
score: 175.625
object id: 7
position: -54 | 37.275 | -0.5
velocity: 0 | 109.5
size: 1 | 1.30696e+27 | 1
score: 174.25
object id: 5
position: -54 | -16.7812 | -0.5
velocity: 0 | 16.8125
size: 1 | inf | 1
score: 174.25
object id: 1
position: -54 | -15.7875 | -0.5
velocity: 0 | 39.5
size: 1 | 3.80011e+07 | 1
score: 164.375
object id: 7
position: -54 | 52.275 | -0.5
velocity: 0 | 129.5
size: 1 | 7.5437e+21 | 1
score: 163
object id: 3
position: -54 | 36.375 | -0.5
velocity: 0 | 85.875
size: 1 | 2.74451e+12 | 1
score: 162.5
object id: 1
position: -54 | 21.9375 | -0.5
velocity: 0 | 40.0625
size: 1 | inf | 1
score: 152.375
object id: 5
position: -54 | -23.0625 | -0.5
velocity: 0 | 37.875
size: 1 | 6.86862e+21 | 1
score: 141.375
object id: 7
position: -54 | -37.1156 | -0.5
velocity: 0 | 153.75
size: 1 | 1.74452e+12 | 1
score: 141.25
object id: 9
position: -54 | 44.325 | -0.5
velocity: 0 | 153.625
size: 1 | 3.62929e+26 | 1
score: 141
object id: 5
position: -54 | 24 | -0.5
velocity: 0 | 106.562
size: 1 | 1.473e+17 | 1
score: 128.25

As you can see, there are infs and other nonsense values with odd patterns among them. When building the engine, the following warning is logged for the head, which seems like a likely cause for bad outputs:

[W] Detected layernorm nodes in FP16: Sub_352, Pow_354, ReduceMean_355, Add_357, Sqrt_358, Div_359, Mul_360, Add_361, Sub_460, Pow_462, ReduceMean_463, Add_465, Sqrt_466, Div_467, Mul_468, Add_469, Sub_477, Pow_479, ReduceMean_480, Add_482, Sqrt_483, Div_484, Mul_485, Add_486
[W] Running layernorm after self-attention in FP16 may cause overflow. Exporting the model to the latest available ONNX opset (later than opset 17) to use the INormalizationLayer, or forcing layernorm layers to run in FP32 precision can help with preserving accuracy.

As such, I added the following code to try to force the offending layers to use FP32:

  const int num_layers = network_->getNbLayers();
  // these are taken from the warning that otherwise logs
  std::vector<std::string> layernorm_names = {
      "Sub_460",  "Pow_462", "ReduceMean_463", "Add_465",
      "Sqrt_466", "Div_467", "Mul_468",        "Add_469",
      "Sub_477",  "Pow_479", "ReduceMean_480", "Add_482",
      "Sqrt_483", "Div_484", "Mul_485",        "Add_486",
      "Sub_352",  "Pow_354", "ReduceMean_355", "Add_357",
      "Sqrt_358", "Div_359", "Mul_360",        "Add_361"};

  if (onnx_file_ == "head.bbox.onnx") {
    // loop over all layers and set the ones from the warning to FP32
    for (int i = 0; i < num_layers; i++) {
      nvinfer1::ILayer* layer = network_->getLayer(i);
      if (std::find(layernorm_names.begin(), layernorm_names.end(),
                    std::string(layer->getName())) != layernorm_names.end()) {
        layer->setPrecision(nvinfer1::DataType::kFLOAT);
        for (int j = 0; j < layer->getNbOutputs(); j++) {
          layer->setOutputType(j, nvinfer1::DataType::kFLOAT);
        }
      }
    }
  }

But the outputs are still garbage and, very bizarrely, the warning still shows but now reads:

[W] Detected layernorm nodes in FP16: , , , , , , , , , , , , , , , , , , , , , , , 

All of the layer names are replaced with empty strings, but the entries are still listed. I also tried forcing the 1, 5, or 10 layers before and after each listed layernorm layer to FP32 as well, but it didn't help.

Why might my model give such garbage outputs? I used polygraphy to compare the ONNX and TensorRT outputs with the command polygraphy run camera.backbone.onnx --trt --fp16 --onnxrt --precision-constraints=obey. It fails on most of the resnet50int8 .onnx files due to non-positive scaling values in some of the Q/DQ layers, and on the lidar backbone because of the sparse convolution, but it works on the rest of resnet50: it shows essentially no error in FP32 and only small error in FP16, except for the outputs of head.bbox.onnx. So it really does seem likely that those layernorm layers and the resulting FP16 overflow are the cause of the bad values I'm getting, but how can I set their precision properly with the C++ API?

Thanks in advance for any help.


josh-wende commented Mar 19, 2024

Actually, I'm now less certain that the layernorm layers are the (only) issue. When I set only the model inputs and outputs to FP16 and leave the rest of the network in FP32, the layernorm FP16 overflow warning no longer shows up, but the values are still bad, both for resnet50int8 and resnet50.

resnet50int8:

==================BEVFusion===================
[⏰ [NoSt] CopyLidar]: 	0.11366 ms
[⏰ [NoSt] ImageNrom]: 	0.84246 ms
[⏰ Lidar Backbone]: 	2.04800 ms
[⏰ Camera Depth]: 	0.05530 ms
[⏰ Camera Backbone]: 	41.80797 ms
[⏰ Camera Bevpool]: 	0.36864 ms
[⏰ VTransform]: 	3.01875 ms
[⏰ Transfusion]: 	10.28707 ms
[⏰ Head BoundingBox]: 	12.10858 ms
Total: 69.694 ms
=============================================
frame 0:
num bboxes: 12
object id: 1
position: -54 | 51.525 | -0.5
velocity: 0 | 154
size: 1 | 1.61342e+12 | 1
score: 181
object id: 1
position: -54 | -2.7 | -0.5
velocity: 0 | 175.875
size: 1 | 9.09944e+21 | 1
score: 180.875
object id: 1
position: -54 | 11.25 | -0.5
velocity: 0 | 108.812
size: 1 | inf | 1
score: 176.75
object id: 1
position: -54 | 51.45 | -0.5
velocity: 0 | 106.688
size: 1 | 4.20885e+11 | 1
score: 174.375
object id: 5
position: -54 | -18.375 | -0.5
velocity: 0 | 130.75
size: 1 | inf | 1
score: 162.25
object id: 3
position: -54 | -37.8094 | -0.5
velocity: 0 | 5.57812
size: 1 | inf | 1
score: 159
object id: 9
position: -54 | -23.625 | -0.5
velocity: 0 | 109.125
size: 1 | inf | 1
score: 140.5
object id: 7
position: -54 | 36.75 | -0.5
velocity: 0 | 174
size: 1 | 3.33743e+27 | 1
score: 139.375
object id: 9
position: -54 | 11.625 | -0.5
velocity: 0 | 98.875
size: 1 | 1.7999e+12 | 1
score: 138.5
object id: 7
position: -54 | -16.7437 | -0.5
velocity: 0 | 49.875
size: 1 | inf | 1
score: 130
object id: 7
position: -54 | -16.3875 | -0.5
velocity: 0 | 110.125
size: 1 | inf | 1
score: 118.062
object id: 7
position: -54 | -39.6656 | -0.5
velocity: 0 | 86.5625
size: 1 | inf | 1
score: 117.562

resnet50:

==================BEVFusion===================
[⏰ [NoSt] CopyLidar]: 	0.07859 ms
[⏰ [NoSt] ImageNrom]: 	0.53136 ms
[⏰ Lidar Backbone]: 	2.92694 ms
[⏰ Camera Depth]: 	0.04198 ms
[⏰ Camera Backbone]: 	56.55242 ms
[⏰ Camera Bevpool]: 	0.32358 ms
[⏰ VTransform]: 	2.73101 ms
[⏰ Transfusion]: 	12.36144 ms
[⏰ Head BoundingBox]: 	9.36582 ms
Total: 84.303 ms
=============================================
frame 0:
num bboxes: 25
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 84.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 83.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 82.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 81.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 80.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 79.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 78.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 77.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 76.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 75.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 74.5
object id: 9
position: -54 | -53.6927 | -0.5
velocity: 0 | 0.512207
size: 1 | 1.66897 | 1
score: 73.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 72.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 71.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 70.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 69.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 68.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 67.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 66.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 65.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 64.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 63.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 62.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 61.5
object id: 8
position: -53.6927 | -54 | -0.322278
velocity: 0.512207 | 0
size: 1.66897 | 1 | 1.66897
score: 60.5

I see no more infs for resnet50 in this case, but the detections are clearly still not real objects, and there are lots of infs in the resnet50int8 output. The values in both still show strange repeating patterns. Does anyone have any idea what could be causing these issues?
