# A Program to Test the Pooling Custom-Layer TensorRT Plugin

In [1]:
# Test configuration: edge length of the square input, pooling window, stride.
input_size = 225
win_size = 3
stride = 2
# Output edge length of an unpadded pooling layer: the number of window
# positions that fit into the input. The earlier `input_size // stride` only
# agreed with this for some sizes (e.g. it yields 112 instead of 111 for
# input_size = 224 with a 3-wide window and stride 2).
output_size = (input_size - win_size) // stride + 1

In [2]:
import numpy as np
import torch

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# same random test input every time.
np.random.seed(0)

# Random float32 image in (channels, height, width) layout.
x = np.random.randn(3, input_size, input_size).astype(np.float32)
input_tensor = torch.tensor(x)
# Prepend a batch axis of size 1.
input_batch = input_tensor.unsqueeze(0)

In [3]:
# Confirm the batched input has shape (1, 3, input_size, input_size).
input_batch.shape

torch.Size([1, 3, 225, 225])

In [4]:
# Display the random input tensor for a quick visual check.
input_batch

tensor([[[[-0.0916, -0.2049, -2.5235,  ...,  1.6918,  0.8317,  1.0882],
          [ 0.5489, -0.2090, -0.5563,  ..., -0.1105, -0.7709, -1.1957],
          [-0.9123, -0.0200, -0.4492,  ..., -1.7014,  0.6693, -0.5958],
          ...,
          [ 0.4422, -0.3637, -0.2588,  ...,  0.4260, -0.5451,  1.1153],
          [-1.1933, -0.4092,  1.0897,  ..., -0.2070,  0.5956,  1.5591],
          [-0.1582, -0.4287, -0.9983,  ...,  0.4669, -1.6858, -2.4387]],

         [[ 0.7774, -1.3232, -0.3495,  ...,  1.2233,  0.6606,  0.7040],
          [ 0.6343,  0.3259, -0.2368,  ..., -0.6005, -0.4800,  0.3855],
          [-0.4922,  0.2318, -0.2382,  ..., -0.7706,  0.4016,  1.4081],
          ...,
          [-0.2572,  1.1225, -0.5840,  ...,  0.9771, -1.1394, -0.2674],
          [ 0.5636, -0.7101,  0.0241,  ...,  0.9924, -1.0682, -0.8150],
          [-2.3161,  1.6169, -0.0851,  ..., -1.1792,  0.1251, -0.0715]],

         [[-1.0708,  0.9127, -0.8217,  ..., -0.3503, -0.7841, -0.2210],
          [-1.1091, -0.5586,  

In [5]:
def define_trt_network(network):
    """Populate ``network`` with TensorRT's built-in max-pooling layer.

    The network gets one float32 input named 'input' with shape
    (1, 3, input_size, input_size), a single MAX pooling layer configured
    from the notebook-level ``win_size`` and ``stride`` values, and one
    output named 'output'.

    Args:
        network: TensorRT network definition (as obtained from
            ``builder.create_network``); it is modified in place.
    """
    # Input
    input_tensor = network.add_input(
        name='input', dtype=trt.float32, shape=(1, 3, input_size, input_size))

    # MaxPool2d -- built with the N-d pooling API; the 2-D-only
    # add_pooling()/stride pair is deprecated in the TensorRT 8 Python API.
    layer = network.add_pooling_nd(
        input=input_tensor, type=trt.PoolingType.MAX,
        window_size=(win_size, win_size))
    layer.stride_nd = (stride, stride)

    # Output
    output_tensor = layer.get_output(0)
    output_tensor.name = 'output'
    network.mark_output(tensor=output_tensor)

In [6]:
import trt_analyzer
import tensorrt as trt

# Network-creation flag selecting explicit-batch mode.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(logger):
    """Build a TensorRT engine containing the built-in pooling network.

    As a side effect, the layer description returned by
    ``trt_analyzer.network_dict`` is stored in the module-level ``net_dict``.

    Args:
        logger: TensorRT logger used for both the builder and the runtime.

    Returns:
        The engine deserialized from the freshly built plan.

    Raises:
        RuntimeError: if the builder returns no serialized network.
    """
    global net_dict
    # Use the caller's logger for the runtime as well, rather than relying on
    # a TRT_LOGGER global that is only defined in a later cell.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        config.max_workspace_size = 1 << 30
        # Define the TRT network with the built-in pooling layer.
        define_trt_network(network)
        # Get network info
        net_dict = trt_analyzer.network_dict(network)
        # Build and return an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            # Stop here with a clear message instead of handing None to the
            # deserializer.
            raise RuntimeError('TensorRT failed to build the serialized network')
        return runtime.deserialize_cuda_engine(plan)

In [7]:
import tensorrt as trt
import common

# Verbose TensorRT logger shared by the engine-building cells.
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
# Reset the network description; build_engine() fills it in as a side effect.
net_dict = None

# Build the built-in pooling engine and run a single inference on input_batch.
with build_engine(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        # Hand the test input to the first input binding.
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

  import sys
  


In [8]:
import pandas as pd

# Tabulate the layer description that build_engine() stored in net_dict.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [Pooling],LayerType.POOLING,"(1, 3, 225, 225)","(1, 3, 112, 112)","type=PoolingType.MAX wsize=(3, 3) stride=(2, 2..."


In [9]:
# Baseline: output of TensorRT's built-in pooling layer, shaped as
# (channels, height, width). It is shaped with the shared `output_size`
# (rather than a hard-coded `input_size // 2`) so it stays consistent with the
# plugin result, and copied so the baseline does not alias the inference
# output buffer.
reference = trt_outputs[0].reshape((3, output_size, output_size)).copy()
print(reference)

[[[ 0.54893416  1.989144    1.989144   ...  1.3599222   1.6918298
    1.6918298 ]
  [ 0.15404245  0.5594461   1.059617   ...  2.011133    0.85054946
    0.66932017]
  [ 1.4160961   2.2378023   2.2378023  ...  1.4213791   0.39022008
    0.39022008]
  ...
  [ 1.8694088   0.92679745  0.67997384 ...  1.9961305   1.6812259
    0.97576284]
  [ 1.8694088   1.1513      1.1513     ...  1.9961305   1.7335786
    1.1152521 ]
  [ 1.0897418   1.4958568   1.4958568  ...  2.5603087   2.5603087
    1.5591258 ]]

 [[ 0.7773949   1.9087864   1.3096867  ...  2.6800668   2.4206667
    1.4081054 ]
  [ 1.6873859   2.2672944   1.0372812  ...  1.8019952   1.5644099
    1.5644099 ]
  [ 1.6463834   0.28332746  1.3296348  ...  0.824375    1.0844419
    0.9269205 ]
  ...
  [ 1.4914231   0.68575054  1.4856282  ...  1.7178365   1.7178365
    1.8403813 ]
  [ 1.4914231   0.94579667  0.92012715 ...  1.5533671   1.5533671
    1.8403813 ]
  [ 1.6168652   1.4979389   2.0956516  ...  0.57644784  0.9924499
    0.9924499 ]]

In [10]:
import sys
import os

# Make the modules in ./plugin importable.
# os.getcwd() replaces the IPython-only `%pwd` magic, so the cell is plain Python.
cur_path = os.getcwd()
plugin_path = os.path.join(cur_path, 'plugin')
# Avoid piling up duplicate sys.path entries when the cell is re-run.
if plugin_path not in sys.path:
    sys.path.append(plugin_path)
from trt_plugin_pb2 import copy_Message
from trt_plugin_pb2 import pooling_Message
import trt_plugin_pb2

In [11]:
import ctypes

# Load the compiled pooling plugin shared library from plugin/build.
# NOTE(review): loading the library presumably registers its plugin creators
# with TensorRT -- confirm against the plugin sources.
lib_file = os.path.join(plugin_path, 'build', 'libPoolingPlugin.so')
# Keep the handle in `lib` so the library stays referenced.
lib = ctypes.CDLL(lib_file)

In [12]:
import tensorrt as trt

# Fetch TensorRT's plugin registry and list every registered creator:
# first all creator names, then the namespace of each creator.
registry = trt.get_plugin_registry()
creator_names = []
for creator in registry.plugin_creator_list:
    creator_names.append(creator.name)
print(creator_names)
creator_namespaces = []
for creator in registry.plugin_creator_list:
    creator_namespaces.append(creator.plugin_namespace)
print(creator_namespaces)

['CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'RnRes2Br1Br2c_TRT', 'RnRes2Br1Br2c_TRT', 'CustomQKVToContextPluginDynamic', 'CustomQKVToContextPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomGeluPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CgPersistentLSTMPlugin_TRT', 'SingleStepLSTMPlugin', 'RnRes2Br2bBr2c_TRT', 'RnRes2Br2bBr2c_TRT', 'CustomFCPluginDynamic', 'CustomQKVToContextPluginDynamic', 'GroupNormalizationPlugin', 'RnRes2FullFusion_TRT', 'InstanceNormalization_TRT', 'pooling', 'copy']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'macnica_trt_plugins', 'macnica_trt_plugins']


In [13]:
# Select the creators that belong to the custom plugin namespace and register
# each of them with the registry under that namespace. The return value of
# register_creator() is not checked.
namespace = 'macnica_trt_plugins'
macnica_creators = list(
    filter(
        lambda creator: creator.plugin_namespace == namespace,
        registry.plugin_creator_list,
    )
)
for c in macnica_creators:
    registry.register_creator(c, namespace)

In [14]:
def define_trt_plugin_network(network):
    """Populate ``network`` with the custom pooling plugin layer.

    The network gets one float32 input named 'input' with shape
    (1, 3, input_size, input_size), a single plugin layer created from the
    'pooling' creator (version '1') in the 'macnica_trt_plugins' namespace,
    and one output named 'output'. The plugin is configured through a
    serialized ``pooling_Message`` carrying the input dims, the maximum mode,
    and the notebook-level ``win_size`` and ``stride`` values.

    Args:
        network: TensorRT network definition (as obtained from
            ``builder.create_network``); it is modified in place.

    Raises:
        RuntimeError: if the plugin creator is not registered or the plugin
            cannot be created from the serialized message.
    """
    # Input
    input_tensor = network.add_input(
        name='input', dtype=trt.float32, shape=(1, 3, input_size, input_size))

    ### Custom Pooling Layer with CUDA or cuDNN ###
    creator = registry.get_plugin_creator(
        type='pooling', version='1', plugin_namespace='macnica_trt_plugins')
    if creator is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise RuntimeError(
            "Plugin creator 'pooling' (version '1') was not found in namespace "
            "'macnica_trt_plugins'; is the plugin library loaded?")
    sz = input_tensor.shape
    message = pooling_Message(
        dims=sz, mode=trt_plugin_pb2.Maximum, window=[win_size, win_size],
        stride=[stride, stride], impl=trt_plugin_pb2.CUDA)
    plg = creator.deserialize_plugin('pooling', message.SerializeToString())
    if plg is None:
        raise RuntimeError("Failed to create the 'pooling' plugin from its message")
    layer = network.add_plugin_v2(inputs=[input_tensor], plugin=plg)

    # Output
    output_tensor = layer.get_output(0)
    output_tensor.name = 'output'
    network.mark_output(tensor=output_tensor)

In [19]:
import trt_analyzer
import tensorrt as trt

# Network-creation flag selecting explicit-batch mode.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine2(logger):
    """Build a TensorRT engine containing the custom pooling plugin network.

    As a side effect, the layer description returned by
    ``trt_analyzer.network_dict`` is stored in the module-level ``net_dict``.

    Args:
        logger: TensorRT logger used for both the builder and the runtime.

    Returns:
        The engine deserialized from the freshly built plan.

    Raises:
        RuntimeError: if the builder returns no serialized network.
    """
    global net_dict
    # Use the caller's logger for the runtime as well, rather than reaching
    # for the TRT_LOGGER global.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        config.max_workspace_size = 1 << 30
        # Define the TRT network with the custom pooling plugin layer.
        define_trt_plugin_network(network)
        # Get network info
        net_dict = trt_analyzer.network_dict(network)
        # Build and return an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            # Stop here with a clear message instead of handing None to the
            # deserializer.
            raise RuntimeError('TensorRT failed to build the serialized network')
        return runtime.deserialize_cuda_engine(plan)

In [20]:
# Reset the network description; build_engine2() fills it in as a side effect.
net_dict = None

# Build the plugin-based engine and run a single inference on input_batch.
with build_engine2(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        # Hand the test input to the first input binding.
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

In [21]:
# Tabulate the layer description that build_engine2() stored in net_dict.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [PluginV2Ext],LayerType.PLUGIN_V2,"(1, 3, 225, 225)","(1, 3, 112, 112)",


In [22]:
# Output of the plugin-based engine, viewed as (channels, height, width).
plugin_output = trt_outputs[0]
result = plugin_output.reshape(3, output_size, output_size)
print(result)

[[[ 0.54893416  1.989144    1.989144   ...  1.3599222   1.6918298
    1.6918298 ]
  [ 0.15404245  0.5594461   1.059617   ...  2.011133    0.85054946
    0.66932017]
  [ 1.4160961   2.2378023   2.2378023  ...  1.4213791   0.39022008
    0.39022008]
  ...
  [ 1.8694088   0.92679745  0.67997384 ...  1.9961305   1.6812259
    0.97576284]
  [ 1.8694088   1.1513      1.1513     ...  1.9961305   1.7335786
    1.1152521 ]
  [ 1.0897418   1.4958568   1.4958568  ...  2.5603087   2.5603087
    1.5591258 ]]

 [[ 0.7773949   1.9087864   1.3096867  ...  2.6800668   2.4206667
    1.4081054 ]
  [ 1.6873859   2.2672944   1.0372812  ...  1.8019952   1.5644099
    1.5644099 ]
  [ 1.6463834   0.28332746  1.3296348  ...  0.824375    1.0844419
    0.9269205 ]
  ...
  [ 1.4914231   0.68575054  1.4856282  ...  1.7178365   1.7178365
    1.8403813 ]
  [ 1.4914231   0.94579667  0.92012715 ...  1.5533671   1.5533671
    1.8403813 ]
  [ 1.6168652   1.4979389   2.0956516  ...  0.57644784  0.9924499
    0.9924499 ]]

In [23]:
# Mean absolute difference between the plugin output and the built-in pooling
# output, computed with vectorised NumPy operations instead of Python's
# element-by-element sum() over repeatedly flattened copies.
mean_abs_error = np.abs(result - reference).mean()
print(mean_abs_error)
# Make a mismatch fail the run instead of relying on someone reading the number.
np.testing.assert_allclose(result, reference, rtol=0, atol=1e-6)

0.0
