# A Program to Test the Custom Pooling Layer TensorRT Plugin

In [1]:
# Test configuration shared by the built-in pooling network and the plugin network.
input_size = 225  # height and width of the square input
win_size = 3      # pooling window edge length
stride = 2        # pooling stride in both spatial directions
# Spatial extent of an unpadded pooling output. The previous `input_size // stride`
# only matched this by coincidence for the values above (e.g. it is off by one
# for input_size = 224).
output_size = (input_size - win_size) // stride + 1

In [2]:
import numpy as np
import torch

# Draw a random (3, input_size, input_size) array, store it as float32,
# and wrap it in a tensor with a leading batch axis of length 1.
x = np.random.randn(3, input_size, input_size).astype(np.float32)
input_tensor = torch.tensor(x)
input_batch = torch.unsqueeze(input_tensor, 0)

In [3]:
# Confirm the batch axis was added: expect (1, 3, input_size, input_size).
input_batch.shape

torch.Size([1, 3, 225, 225])

In [4]:
# Display the randomly generated input batch.
input_batch

tensor([[[[ 2.6501, -0.3462, -1.3415,  ..., -0.6482, -0.5050, -2.6726],
          [ 0.9051, -1.4667, -1.3951,  ..., -0.1859,  2.0105,  1.2380],
          [ 0.0251,  0.8789, -0.5912,  ..., -0.6022, -0.3832, -0.7878],
          ...,
          [ 0.8270,  0.1583, -0.5770,  ...,  0.1054,  1.2273,  0.0277],
          [-0.9453, -0.9123, -0.1213,  ...,  0.3165,  0.6516,  0.6366],
          [ 0.1720,  0.1371,  2.1517,  ...,  1.0750, -0.8371, -0.0215]],

         [[ 0.1252, -1.1103,  0.7704,  ...,  0.4681,  1.0899,  0.3511],
          [-0.0417,  0.4086, -0.4040,  ..., -0.9465, -2.4532,  0.1196],
          [ 1.4319, -0.1796, -1.8485,  ..., -0.7844,  1.1484, -0.3196],
          ...,
          [ 1.2828,  0.1004,  1.1142,  ...,  0.9532,  1.3244, -2.0370],
          [-0.8343,  1.0911, -0.8877,  ...,  1.2077,  1.0046,  0.8553],
          [ 0.0198, -1.5841, -0.6951,  ...,  0.1021,  1.2555, -2.3765]],

         [[-1.4285, -0.9405,  0.7697,  ...,  0.0079, -2.0130,  0.3904],
          [ 1.6152, -0.2463,  

In [5]:
def define_trt_network(network):
    """Populate ``network`` with a single built-in max-pooling layer.

    The network takes one float32 input named 'input' with shape
    (1, 3, input_size, input_size), applies a win_size x win_size max pooling
    with the configured stride, and marks the pooled tensor as the network
    output under the name 'output'.

    Args:
        network: the TensorRT network definition to fill in.
    """
    # Input
    batch_shape = (1, 3, input_size, input_size)
    net_input = network.add_input(name='input', dtype=trt.float32, shape=batch_shape)

    # MaxPool2d
    pool = network.add_pooling_nd(
        input=net_input,
        type=trt.PoolingType.MAX,
        window_size=(win_size, win_size),
    )
    pool.stride_nd = (stride, stride)

    # Output
    pooled = pool.get_output(0)
    pooled.name = 'output'
    network.mark_output(tensor=pooled)

In [6]:
import trt_analyzer
import tensorrt as trt

# Network creation flag requesting an explicit-batch network definition.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(logger):
    """Build a TensorRT engine for the built-in pooling reference network.

    As a side effect, the global ``net_dict`` is set to the layer summary that
    ``trt_analyzer.network_dict`` produces for the network.

    Args:
        logger: the TensorRT logger used for both the builder and the runtime.

    Returns:
        The engine deserialized from the freshly built plan.

    Raises:
        RuntimeError: if the builder fails to produce a serialized network.
    """
    # Use the caller's logger for the runtime as well, rather than relying on a
    # module-level TRT_LOGGER that may not exist yet when this function runs.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        # Prefer set_memory_pool_limit when this TensorRT build provides it;
        # otherwise fall back to the max_workspace_size attribute.
        if hasattr(config, 'set_memory_pool_limit'):
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
        else:
            config.max_workspace_size = 1 << 30
        # Define the TRT network with the built-in pooling layer.
        define_trt_network(network)
        # Get network info
        global net_dict
        net_dict = trt_analyzer.network_dict(network)
        # Build and return an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            # Fail here with a clear message instead of passing None on to the runtime.
            raise RuntimeError('TensorRT failed to build the serialized network')
        engine = runtime.deserialize_cuda_engine(plan)
        return engine

In [7]:
import tensorrt as trt
import common

# Logger created at WARNING severity and handed to the engine builder.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Reset the layer summary; build_engine() assigns it as a side effect.
net_dict = None

# Build the reference engine, stage the random batch as its input and run it once.
with build_engine(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

In [8]:
import pandas as pd

# Tabulate the layer summary stored in net_dict while the engine was built.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [Pooling],LayerType.POOLING,"(1, 3, 225, 225)","(1, 3, 112, 112)","type=PoolingType.MAX wsize=(3, 3) stride=(2, 2..."


In [9]:
# Keep the built-in pooling result as the reference, reshaped to
# (3, output_size, output_size). output_size is used instead of a hard-coded
# `input_size // 2` so the shape follows the configured stride.
reference = trt_outputs[0].reshape((3, output_size, output_size))
print(reference)

[[[2.6500769  3.2290633  1.2284403  ... 0.00681744 0.3772723  2.0105433 ]
  [1.007256   0.9180095  0.9180095  ... 1.4345497  1.4345497  0.37398094]
  [1.99439    0.73119545 0.73119545 ... 1.0924299  1.2929664  0.79655266]
  ...
  [1.448458   1.448458   2.810543   ... 1.625236   1.3118337  1.3118337 ]
  [1.9775156  1.448458   1.5463194  ... 0.44959897 1.2018094  1.8659633 ]
  [2.1517208  2.1517208  2.2942665  ... 1.4624052  1.2018094  1.227329  ]]

 [[1.4319263  0.77039176 2.1400537  ... 0.5548143  0.8839576  1.1483607 ]
  [2.3155487  1.7184061  1.7184061  ... 1.2364593  1.565932   1.1483607 ]
  [2.3155487  2.2861629  1.505162   ... 1.2364593  1.5126514  1.7555597 ]
  ...
  [1.7056639  0.43093443 1.6173016  ... 0.73323804 1.6115369  1.6105862 ]
  [1.7056639  1.1142296  1.0374436  ... 2.5228689  0.9532252  1.3243661 ]
  [1.2828177  1.1142296  0.60522914 ... 1.1300025  1.2077414  1.3243661 ]]

 [[1.6152198  2.0561554  2.0561554  ... 2.500008   0.20238331 0.9177837 ]
  [0.70294493 2.056155

In [10]:
import sys
import os

# Make the ./plugin directory importable so the trt_plugin_pb2 module imported
# below can be found. os.getcwd() replaces the IPython-only %pwd magic, and the
# membership test keeps re-runs of this cell from piling up duplicate entries.
cur_path = os.getcwd()
plugin_path = os.path.join(cur_path, 'plugin')
if plugin_path not in sys.path:
    sys.path.append(plugin_path)
from trt_plugin_pb2 import copy_Message
from trt_plugin_pb2 import pooling_Message
import trt_plugin_pb2

In [11]:
import ctypes

# Load the compiled plugin shared library from <plugin_path>/build.
# NOTE(review): presumably loading it is what makes the 'pooling' and 'copy'
# creators appear in the TensorRT plugin registry - confirm against the plugin source.
lib_file = os.path.join(plugin_path, 'build', 'libPoolingPlugin.so')
lib = ctypes.CDLL(lib_file)

In [12]:
import tensorrt as trt

# List the name and namespace of every plugin creator currently in the registry.
registry = trt.get_plugin_registry()
creator_names = []
creator_namespaces = []
for creator in registry.plugin_creator_list:
    creator_names.append(creator.name)
    creator_namespaces.append(creator.plugin_namespace)
print(creator_names)
print(creator_namespaces)

['CustomQKVToContextPluginDynamic', 'CustomQKVToContextPluginDynamic', 'CustomQKVToContextPluginDynamic', 'RnRes2Br1Br2c_TRT', 'RnRes2Br1Br2c_TRT', 'RnRes2FullFusion_TRT', 'SmallTileGEMM_TRT', 'RNNTEncoderPlugin', 'DLRM_BOTTOM_MLP_TRT', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'SingleStepLSTMPlugin', 'RnRes2Br2bBr2c_TRT', 'RnRes2Br2bBr2c_TRT', 'CustomGeluPluginDynamic', 'CustomFCPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'pooling', 'copy']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'macnica_trt_plugins', 'macnica_trt_plugins']


In [13]:
# Pick out the creators that live in the plugin namespace and register each of
# them with the registry under that namespace.
namespace = 'macnica_trt_plugins'
macnica_creators = list(filter(
    lambda creator: creator.plugin_namespace == namespace,
    registry.plugin_creator_list,
))
for c in macnica_creators:
    registry.register_creator(c, namespace)

In [14]:
def define_trt_plugin_network(network):
    """Populate ``network`` with the custom pooling plugin layer.

    The network takes one float32 input named 'input' with shape
    (1, 3, input_size, input_size), feeds it through the 'pooling' plugin
    configured for maximum pooling with the notebook's window and stride, and
    marks the plugin output as the network output under the name 'output'.

    Args:
        network: the TensorRT network definition to fill in.

    Raises:
        RuntimeError: if the 'pooling' creator is not registered, or if it
            does not return a plugin for the serialized configuration.
    """
    # Input
    input_tensor = network.add_input(name='input', dtype=trt.float32, shape=(1, 3, input_size, input_size))

    ### Custom Pooling Layer with CUDA or cuDNN ###
    creator = registry.get_plugin_creator(
        type='pooling', version='1', plugin_namespace='macnica_trt_plugins')
    if creator is None:
        # Without this check the failure shows up as an AttributeError on None below.
        raise RuntimeError(
            "plugin creator 'pooling' (version '1') was not found in namespace "
            "'macnica_trt_plugins'; was the plugin library loaded?")
    sz = input_tensor.shape
    # The plugin is configured through a serialized pooling_Message.
    message = pooling_Message(
        dims=sz, mode=trt_plugin_pb2.Maximum, window=[win_size, win_size],
        stride=[stride, stride], impl=trt_plugin_pb2.CUDA)
    plg = creator.deserialize_plugin('pooling', message.SerializeToString())
    if plg is None:
        raise RuntimeError("the 'pooling' plugin could not be created from the serialized message")
    layer = network.add_plugin_v2(inputs=[input_tensor], plugin=plg)

    # Output
    layer.get_output(0).name = 'output'
    network.mark_output(tensor=layer.get_output(0))

In [15]:
import trt_analyzer
import tensorrt as trt

# Network creation flag requesting an explicit-batch network definition.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine2(logger):
    """Build a TensorRT engine for the custom pooling plugin network.

    As a side effect, the global ``net_dict`` is set to the layer summary that
    ``trt_analyzer.network_dict`` produces for the network.

    Args:
        logger: the TensorRT logger used for both the builder and the runtime.

    Returns:
        The engine deserialized from the freshly built plan.

    Raises:
        RuntimeError: if the builder fails to produce a serialized network.
    """
    # Use the caller's logger for the runtime as well, rather than reaching for
    # the module-level TRT_LOGGER and ignoring the parameter.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        # Prefer set_memory_pool_limit when this TensorRT build provides it;
        # otherwise fall back to the max_workspace_size attribute.
        if hasattr(config, 'set_memory_pool_limit'):
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
        else:
            config.max_workspace_size = 1 << 30
        # Define the TRT network with the custom pooling plugin layer.
        define_trt_plugin_network(network)
        # Get network info
        global net_dict
        net_dict = trt_analyzer.network_dict(network)
        # Build and return an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            # Fail here with a clear message instead of passing None on to the runtime.
            raise RuntimeError('TensorRT failed to build the serialized plugin network')
        engine = runtime.deserialize_cuda_engine(plan)
        return engine

In [16]:
# Reset the layer summary; build_engine2() assigns it as a side effect.
net_dict = None

# Build the plugin engine, stage the same input batch and run it once.
with build_engine2(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.
Process started.
Process finished.


In [17]:
# Tabulate the layer summary stored in net_dict while the plugin engine was built.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [PluginV2Ext],LayerType.PLUGIN_V2,"(1, 3, 225, 225)","(1, 3, 112, 112)",


In [18]:
# View the plugin output as (3, output_size, output_size) and show it.
plugin_shape = (3, output_size, output_size)
result = trt_outputs[0].reshape(plugin_shape)
print(result)

[[[2.6500769  3.2290633  1.2284403  ... 0.00681744 0.3772723  2.0105433 ]
  [1.007256   0.9180095  0.9180095  ... 1.4345497  1.4345497  0.37398094]
  [1.99439    0.73119545 0.73119545 ... 1.0924299  1.2929664  0.79655266]
  ...
  [1.448458   1.448458   2.810543   ... 1.625236   1.3118337  1.3118337 ]
  [1.9775156  1.448458   1.5463194  ... 0.44959897 1.2018094  1.8659633 ]
  [2.1517208  2.1517208  2.2942665  ... 1.4624052  1.2018094  1.227329  ]]

 [[1.4319263  0.77039176 2.1400537  ... 0.5548143  0.8839576  1.1483607 ]
  [2.3155487  1.7184061  1.7184061  ... 1.2364593  1.565932   1.1483607 ]
  [2.3155487  2.2861629  1.505162   ... 1.2364593  1.5126514  1.7555597 ]
  ...
  [1.7056639  0.43093443 1.6173016  ... 0.73323804 1.6115369  1.6105862 ]
  [1.7056639  1.1142296  1.0374436  ... 2.5228689  0.9532252  1.3243661 ]
  [1.2828177  1.1142296  0.60522914 ... 1.1300025  1.2077414  1.3243661 ]]

 [[1.6152198  2.0561554  2.0561554  ... 2.500008   0.20238331 0.9177837 ]
  [0.70294493 2.056155

In [19]:
# Mean absolute difference between the plugin result and the built-in pooling
# reference, computed with vectorized numpy operations instead of Python's
# builtin sum over a flattened array. A value of 0.0 means the outputs match.
print(np.abs(result - reference).mean())

0.0
