# A Program to Test the Pooling Custom Layer TensorRT Plugin

In [1]:
# Test configuration shared by the built-in pooling network and the plugin network.
input_size = 225  # height and width of the square input
win_size = 3      # edge length of the square pooling window
stride = 2        # pooling stride along both spatial axes
# Output size of an unpadded pooling layer: floor((input - window) / stride) + 1.
# `input_size // stride` only happens to give the same value for some
# combinations (e.g. 225/3/2) and is off by one for others (e.g. 224/3/2).
output_size = (input_size - win_size) // stride + 1

In [2]:
import numpy as np
import torch

# Random float32 test input drawn from a fixed seed, so that a fresh
# "Restart & Run All" feeds the networks exactly the same data every time.
rng = np.random.default_rng(0)
x = rng.standard_normal(size=(3, input_size, input_size), dtype=np.float32)
input_tensor = torch.tensor(x)
# Add a leading axis of length 1 to match the (1, 3, H, W) network input.
input_batch = input_tensor.unsqueeze(0)

In [3]:
# Show the shape of the batched input tensor.
input_batch.shape

torch.Size([1, 3, 225, 225])

In [4]:
# Display the batched input tensor.
input_batch

tensor([[[[ 1.9086,  0.6233,  0.4397,  ..., -0.9460,  1.3901, -0.9907],
          [-0.8772, -0.7359, -0.9987,  ..., -0.9756,  2.2991, -0.1512],
          [ 0.6496,  0.1291, -0.2755,  ...,  1.3561, -0.4766,  0.5580],
          ...,
          [ 0.0264, -1.9654,  0.6396,  ...,  1.8939, -1.3221,  0.4334],
          [ 1.1496,  0.0731, -1.7232,  ..., -0.4995,  0.8761, -0.4739],
          [ 1.0310, -0.5421, -1.3319,  ..., -1.3780, -1.7239, -0.2046]],

         [[ 0.9479, -0.6372,  1.2032,  ..., -1.0781,  0.6603,  0.6303],
          [-0.2466, -0.3848,  0.1896,  ..., -0.6849, -0.1012,  0.4828],
          [-0.0541, -0.6104,  2.1602,  ..., -0.1407,  1.9809, -0.0902],
          ...,
          [-0.3290,  0.0813, -1.3113,  ..., -1.2205, -1.7088, -1.1775],
          [-0.8990,  1.8290, -0.8415,  ...,  0.4334, -0.3941,  1.8447],
          [-0.4006, -0.1474, -0.0248,  ...,  1.1039,  0.7396,  1.1029]],

         [[ 0.3785,  0.2741,  0.6076,  ..., -1.3106, -0.2222,  0.2296],
          [-1.2488,  3.1606, -

In [5]:
def define_trt_network(network):
    """Populate ``network`` with a single built-in max-pooling layer.

    The network takes one float32 input named 'input' of shape
    (1, 3, input_size, input_size), applies max pooling with a
    (win_size, win_size) window and a (stride, stride) stride, and marks the
    pooling result as the network output named 'output'.

    Args:
        network: The TensorRT network definition to add the layers to.
    """
    # Network input.
    net_input = network.add_input(
        name='input',
        dtype=trt.float32,
        shape=(1, 3, input_size, input_size),
    )

    # Built-in max pooling layer.
    pool = network.add_pooling_nd(
        input=net_input,
        type=trt.PoolingType.MAX,
        window_size=(win_size, win_size),
    )
    pool.stride_nd = (stride, stride)

    # Name the pooling result and expose it as the network output.
    pooled = pool.get_output(0)
    pooled.name = 'output'
    network.mark_output(tensor=pooled)

In [6]:
import trt_analyzer
import tensorrt as trt

# Flag bit passed to builder.create_network() to request an explicit-batch network.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(logger):
    """Build a TensorRT engine for the built-in max-pooling reference network.

    The network is populated by ``define_trt_network``. As a side effect, the
    module-level ``net_dict`` is set to ``trt_analyzer.network_dict(network)``
    so the network can be inspected after the build.

    Args:
        logger: TensorRT logger used for both the builder and the runtime.

    Returns:
        The engine deserialized from the freshly built serialized network.

    Raises:
        RuntimeError: If the builder does not produce a serialized network.
    """
    global net_dict
    # The runtime uses the caller's logger as well, instead of reaching for the
    # TRT_LOGGER global that is only defined in a later cell.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        # Limit the builder workspace to 1 GiB.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
        # Define the TRT network (a single built-in pooling layer).
        define_trt_network(network)
        # Record the network description for later display.
        net_dict = trt_analyzer.network_dict(network)
        # Build the serialized plan and turn it into an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            raise RuntimeError(
                'TensorRT failed to build the serialized network; see the logger output.')
        return runtime.deserialize_cuda_engine(plan)

In [7]:
import tensorrt as trt
import common

# Cleared here; build_engine() assigns the network description to it.
net_dict = None
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

# Build the built-in pooling engine and run a single inference on input_batch.
with build_engine(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        # Use the random input batch as the host-side data of the first input.
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

[06/22/2022-05:30:07] [TRT] [I] [MemUsageChange] Init CUDA: CPU +202, GPU +0, now: CPU 284, GPU 3239 (MiB)
[06/22/2022-05:30:11] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +141, GPU +131, now: CPU 444, GPU 3385 (MiB)
[06/22/2022-05:30:11] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 444, GPU 3385 (MiB)
[06/22/2022-05:30:11] [TRT] [V] Applying generic optimizations to the graph for inference.
[06/22/2022-05:30:11] [TRT] [V] Original: 1 layers
[06/22/2022-05:30:11] [TRT] [V] After dead-layer removal: 1 layers
[06/22/2022-05:30:11] [TRT] [V] After Myelin optimization: 1 layers
[06/22/2022-05:30:11] [TRT] [V] Applying ScaleNodes fusions.
[06/22/2022-05:30:11] [TRT] [V] After scale fusion: 1 layers
[06/22/2022-05:30:11] [TRT] [V] After vertical fusions: 1 layers
[06/22/2022-05:30:11] [TRT] [V] After dupe layer removal: 1 layers
[06/22/2022-05:30:11] [TRT] [V] After final dead-layer removal: 1 layers
[06/22/2022-05:30:11] [TRT] [V] After tensor merging: 1 l

In [8]:
import pandas as pd

# Show the network description recorded by build_engine() as a table.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [Pooling],LayerType.POOLING,"(1, 3, 225, 225)","(1, 3, 112, 112)","type=PoolingType.MAX wsize=(3, 3) stride=(2, 2..."


In [9]:
# Reshape the first output of the built-in pooling network to
# (3, output_size, output_size). The shared `output_size` is used instead of a
# hard-coded `input_size // 2`, so this stays in step with the plugin result
# and with the configured stride.
reference = trt_outputs[0].reshape((3, output_size, output_size))
print(reference)

[[[1.9086162  2.1730902  2.1730902  ... 2.7569187  1.3561108  2.299056  ]
  [2.2189717  1.9053221  1.7694458  ... 1.0396745  1.3561108  1.3561108 ]
  [2.6266236  0.7471319  1.8011373  ... 2.1103199  2.1103199  1.1804656 ]
  ...
  [1.2724688  1.2353026  1.3805821  ... 1.6026468  1.1303796  1.2193415 ]
  [0.6395517  0.6395517  1.3805821  ... 0.77880335 1.893937   1.893937  ]
  [1.1496416  1.3268417  1.3268417  ... 1.7681036  1.893937   1.893937  ]]

 [[2.1601562  2.1601562  1.9995507  ... 1.1893888  1.1567457  1.980887  ]
  [2.1601562  2.1601562  1.1974529  ... 1.1893888  1.4159371  2.6569247 ]
  [0.5783027  0.7675895  1.1974529  ... 2.2577891  2.0299096  2.6569247 ]
  ...
  [1.39357    1.2697114  2.4497547  ... 2.2015767  1.4375938  1.5441084 ]
  [1.2697114  1.2940738  1.9532217  ... 1.5873559  1.5873559  1.9786321 ]
  [1.8290291  1.2940738  3.39205    ... 1.5873559  1.5873559  1.8447142 ]]

 [[3.1606023  0.79254484 2.1143072  ... 2.3131964  2.3131964  1.2901431 ]
  [2.0867102  1.236468

In [10]:
import sys
import os

# Put the ./plugin directory on the module search path for the imports below.
cur_path = os.getcwd()  # plain-Python replacement for the `%pwd` magic
plugin_path = os.path.join(cur_path, 'plugin')
# Guarded so that re-running this cell does not add duplicate entries.
if plugin_path not in sys.path:
    sys.path.append(plugin_path)
from trt_plugin_pb2 import copy_Message
from trt_plugin_pb2 import pooling_Message
import trt_plugin_pb2

In [11]:
import ctypes

# Load the compiled plugin library from ./plugin/build.
# NOTE(review): loading it presumably registers the 'pooling' and 'copy'
# plugin creators with TensorRT - confirm against the plugin sources.
lib_file = os.path.join(plugin_path, 'build', 'libPoolingPlugin.so')
lib = ctypes.cdll.LoadLibrary(lib_file)

In [12]:
import tensorrt as trt

# List every plugin creator known to the TensorRT plugin registry:
# first the creator names, then the namespace of each creator.
registry = trt.get_plugin_registry()
print([creator.name for creator in registry.plugin_creator_list])
print([creator.plugin_namespace for creator in registry.plugin_creator_list])

['CustomQKVToContextPluginDynamic', 'CustomQKVToContextPluginDynamic', 'CustomQKVToContextPluginDynamic', 'RnRes2Br1Br2c_TRT', 'RnRes2Br1Br2c_TRT', 'RnRes2FullFusion_TRT', 'SmallTileGEMM_TRT', 'RNNTEncoderPlugin', 'DLRM_BOTTOM_MLP_TRT', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'SingleStepLSTMPlugin', 'RnRes2Br2bBr2c_TRT', 'RnRes2Br2bBr2c_TRT', 'CustomGeluPluginDynamic', 'CustomFCPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'pooling', 'copy']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'macnica_trt_plugins', 'macnica_trt_plugins']


In [13]:
# Select the creators that live in the custom plugin namespace and register
# each of them with the registry under that namespace.
namespace = 'macnica_trt_plugins'
macnica_creators = [
    creator
    for creator in registry.plugin_creator_list
    if creator.plugin_namespace == namespace
]
for c in macnica_creators:
    registry.register_creator(c, namespace)

In [14]:
def define_trt_plugin_network(network):
    """Populate ``network`` with a single custom pooling plugin layer.

    The network takes one float32 input named 'input' of shape
    (1, 3, input_size, input_size). The plugin is created from a serialized
    ``pooling_Message`` describing the input dimensions, the pooling mode
    (``trt_plugin_pb2.Maximum``), the window, the stride and the
    implementation (``trt_plugin_pb2.CUDA``). The plugin output is marked as
    the network output named 'output'.

    Args:
        network: The TensorRT network definition to add the layers to.

    Raises:
        RuntimeError: If the 'pooling' plugin creator is not registered, or if
            the plugin cannot be created from the serialized message.
    """
    # Input
    input_tensor = network.add_input(
        name='input', dtype=trt.float32, shape=(1, 3, input_size, input_size))

    ### Custom Pooling Layer with CUDA or cuDNN ###
    creator = registry.get_plugin_creator(
        type='pooling', version='1', plugin_namespace='macnica_trt_plugins')
    if creator is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise RuntimeError(
            "Plugin creator 'pooling' (version '1', namespace 'macnica_trt_plugins') "
            "is not registered; make sure the plugin library has been loaded.")
    message = pooling_Message(
        dims=input_tensor.shape,
        mode=trt_plugin_pb2.Maximum,
        window=[win_size, win_size],
        stride=[stride, stride],
        impl=trt_plugin_pb2.CUDA,
    )
    plg = creator.deserialize_plugin('pooling', message.SerializeToString())
    if plg is None:
        raise RuntimeError("Failed to create the 'pooling' plugin from the serialized message.")
    layer = network.add_plugin_v2(inputs=[input_tensor], plugin=plg)

    # Output
    layer.get_output(0).name = 'output'
    network.mark_output(tensor=layer.get_output(0))

In [15]:
import trt_analyzer
import tensorrt as trt

# Flag bit passed to builder.create_network() to request an explicit-batch network.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine2(logger):
    """Build a TensorRT engine for the custom pooling plugin network.

    The network is populated by ``define_trt_plugin_network``. As a side
    effect, the module-level ``net_dict`` is set to
    ``trt_analyzer.network_dict(network)`` so the network can be inspected
    after the build.

    Args:
        logger: TensorRT logger used for both the builder and the runtime.

    Returns:
        The engine deserialized from the freshly built serialized network.

    Raises:
        RuntimeError: If the builder does not produce a serialized network.
    """
    global net_dict
    # The runtime uses the caller's logger as well, instead of silently
    # ignoring the parameter in favour of the TRT_LOGGER global.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        # Limit the builder workspace to 1 GiB.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
        # Define the TRT network (a single custom pooling plugin layer).
        define_trt_plugin_network(network)
        # Record the network description for later display.
        net_dict = trt_analyzer.network_dict(network)
        # Build the serialized plan and turn it into an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            raise RuntimeError(
                'TensorRT failed to build the serialized network; see the logger output.')
        return runtime.deserialize_cuda_engine(plan)

In [16]:
# Cleared here; build_engine2() assigns the network description to it.
net_dict = None

# Build the plugin-based engine and run a single inference on input_batch.
with build_engine2(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        # Use the random input batch as the host-side data of the first input.
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

[06/22/2022-05:30:14] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 792, GPU 3727 (MiB)
[06/22/2022-05:30:14] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 792, GPU 3727 (MiB)
[06/22/2022-05:30:14] [TRT] [V] Applying generic optimizations to the graph for inference.
[06/22/2022-05:30:14] [TRT] [V] Original: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After dead-layer removal: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After Myelin optimization: 1 layers
[06/22/2022-05:30:14] [TRT] [V] Applying ScaleNodes fusions.
[06/22/2022-05:30:14] [TRT] [V] After scale fusion: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After vertical fusions: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After dupe layer removal: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After final dead-layer removal: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After tensor merging: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After slice removal: 1 layers
[06/22/2022-05:30:14] [TRT] [V] After concat removal: 1 layer

In [17]:
# Show the network description recorded by build_engine2() as a table.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [PluginV2Ext],LayerType.PLUGIN_V2,"(1, 3, 225, 225)","(1, 3, 112, 112)",


In [18]:
# Reshape the first output of the plugin network to (3, output_size, output_size)
# so it can be compared with the reference element by element.
result = trt_outputs[0].reshape(3, output_size, output_size)
print(result)

[[[1.9086162  2.1730902  2.1730902  ... 2.7569187  1.3561108  2.299056  ]
  [2.2189717  1.9053221  1.7694458  ... 1.0396745  1.3561108  1.3561108 ]
  [2.6266236  0.7471319  1.8011373  ... 2.1103199  2.1103199  1.1804656 ]
  ...
  [1.2724688  1.2353026  1.3805821  ... 1.6026468  1.1303796  1.2193415 ]
  [0.6395517  0.6395517  1.3805821  ... 0.77880335 1.893937   1.893937  ]
  [1.1496416  1.3268417  1.3268417  ... 1.7681036  1.893937   1.893937  ]]

 [[2.1601562  2.1601562  1.9995507  ... 1.1893888  1.1567457  1.980887  ]
  [2.1601562  2.1601562  1.1974529  ... 1.1893888  1.4159371  2.6569247 ]
  [0.5783027  0.7675895  1.1974529  ... 2.2577891  2.0299096  2.6569247 ]
  ...
  [1.39357    1.2697114  2.4497547  ... 2.2015767  1.4375938  1.5441084 ]
  [1.2697114  1.2940738  1.9532217  ... 1.5873559  1.5873559  1.9786321 ]
  [1.8290291  1.2940738  3.39205    ... 1.5873559  1.5873559  1.8447142 ]]

 [[3.1606023  0.79254484 2.1143072  ... 2.3131964  2.3131964  1.2901431 ]
  [2.0867102  1.236468

In [19]:
# Mean absolute difference between the plugin output and the built-in
# reference; 0.0 means the two outputs are identical. Computed with vectorised
# NumPy operations rather than the Python builtins, which loop over every
# element and needed three flattened copies.
print(np.abs(result - reference).mean())

0.0
