# A Program to Test the Custom Pooling Layer TensorRT Plugin

In [1]:
# Test configuration: square input extent, pooling window and stride.
input_size = 225
win_size = 3
stride = 2


def pooled_size(size, window, step):
    """Return the output extent of an unpadded pooling window along one axis.

    Args:
        size: Input extent along the axis.
        window: Pooling window extent along the axis.
        step: Stride along the axis.

    Returns:
        The number of window positions, ``(size - window) // step + 1``.
    """
    return (size - window) // step + 1


# `input_size // stride` only matches the real extent by coincidence
# (e.g. 225/3/2); the window size has to be taken into account.
output_size = pooled_size(input_size, win_size, stride)

In [2]:
import numpy as np
import torch

# Seeded generator so every Restart & Run All feeds both engines the same data.
rng = np.random.default_rng(0)
# Random CHW image drawn directly as float32 (the dtype the TensorRT input uses).
x = rng.standard_normal((3, input_size, input_size), dtype=np.float32)
input_tensor = torch.tensor(x)
# Add the leading batch dimension: (3, H, W) -> (1, 3, H, W).
input_batch = input_tensor.unsqueeze(0)

In [3]:
# Sanity check: the batch dimension added by unsqueeze(0) should come first.
input_batch.shape

torch.Size([1, 3, 225, 225])

In [4]:
# Display the random input batch that both engines will consume.
input_batch

tensor([[[[ 1.5237e-01,  1.2867e-03,  3.4907e-01,  ..., -4.2639e-01,
           -2.5910e+00,  6.5477e-01],
          [ 1.2469e+00,  9.9521e-01, -9.3283e-02,  ..., -7.1065e-02,
           -7.8114e-01,  1.3318e+00],
          [ 7.7094e-01,  7.2175e-01,  4.2063e-02,  ...,  6.2130e-01,
            2.0740e-01, -4.7347e-01],
          ...,
          [-1.4965e+00,  4.2837e-01,  8.5091e-01,  ..., -8.7340e-01,
           -8.1079e-01, -1.1424e+00],
          [ 4.1352e-01, -5.0900e-01, -1.5102e+00,  ..., -5.0868e-01,
           -1.4747e+00, -1.0031e+00],
          [-1.2912e+00,  9.9546e-01, -8.2836e-01,  ..., -2.7126e-01,
            9.4103e-02, -1.6816e+00]],

         [[-2.6476e+00,  8.3387e-01,  2.4967e-01,  ...,  4.2392e-01,
           -1.4385e+00,  1.9493e-01],
          [ 1.6934e+00, -2.3278e+00, -7.3203e-01,  ...,  6.7910e-01,
            2.7497e-01,  1.4299e+00],
          [ 4.8838e-02, -1.2762e+00, -8.2371e-01,  ...,  3.2309e-01,
            1.1854e-01,  8.3346e-01],
          ...,
     

In [5]:
def define_trt_network(network):
    """Populate `network` with a single built-in max-pooling layer.

    The network takes one float32 input named 'input' of shape
    (1, 3, input_size, input_size), pools it with a win_size x win_size
    window at the configured stride, and exposes the result as 'output'.
    The sizes are read from the notebook-level configuration variables.

    Args:
        network: The TensorRT network definition to fill in.
    """
    image = network.add_input(
        name='input',
        dtype=trt.float32,
        shape=(1, 3, input_size, input_size),
    )

    # Built-in MaxPool2d, used as the reference for the custom plugin.
    pool = network.add_pooling_nd(
        input=image,
        type=trt.PoolingType.MAX,
        window_size=(win_size, win_size),
    )
    pool.stride_nd = (stride, stride)

    # Name the pooled tensor and register it as the network output.
    pooled = pool.get_output(0)
    pooled.name = 'output'
    network.mark_output(tensor=pooled)

In [6]:
import trt_analyzer
import tensorrt as trt

# Network creation flag requesting an explicit batch dimension.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(logger):
    """Build a TensorRT engine for the built-in max-pooling reference network.

    As a side effect, the global `net_dict` is set to the layer summary
    produced by `trt_analyzer.network_dict` for the defined network.

    Args:
        logger: The `trt.Logger` used by both the builder and the runtime.

    Returns:
        The deserialized CUDA engine.

    Raises:
        RuntimeError: If the network cannot be built or the serialized plan
            cannot be deserialized.
    """
    # Use the `logger` argument for the runtime too, instead of reaching for
    # the TRT_LOGGER global that is only defined in a later cell.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        # Newer TensorRT releases expose set_memory_pool_limit; older ones
        # only have the max_workspace_size attribute.
        if hasattr(config, 'set_memory_pool_limit'):
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
        else:
            config.max_workspace_size = 1 << 30
        # Define the reference network (built-in pooling layer).
        define_trt_network(network)
        # Get network info
        global net_dict
        net_dict = trt_analyzer.network_dict(network)
        # Build and return an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            raise RuntimeError('TensorRT failed to build the serialized network')
        engine = runtime.deserialize_cuda_engine(plan)
        if engine is None:
            raise RuntimeError('TensorRT failed to deserialize the built engine')
        return engine

In [7]:
import tensorrt as trt
import common

# Verbose logger; it is passed to build_engine() and also read there as a global.
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
# Reset the layer summary; build_engine() repopulates this global as a side effect.
net_dict = None
with build_engine(TRT_LOGGER) as engine:
    # Buffers are created by the project-local `common` helper module.
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        # Feed the random batch as the first network input.
        inputs[0].host = input_batch.numpy()
        # Presumably runs inference and returns the host-side outputs -- verify against common.py.
        trt_outputs = common.do_inference_v2( \
            context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

[06/27/2022-08:30:49] [TRT] [I] [MemUsageChange] Init CUDA: CPU +201, GPU +0, now: CPU 284, GPU 2776 (MiB)
[06/27/2022-08:30:53] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +140, GPU +132, now: CPU 444, GPU 2923 (MiB)
[06/27/2022-08:30:53] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 444, GPU 2923 (MiB)
[06/27/2022-08:30:53] [TRT] [V] Applying generic optimizations to the graph for inference.
[06/27/2022-08:30:53] [TRT] [V] Original: 1 layers
[06/27/2022-08:30:53] [TRT] [V] After dead-layer removal: 1 layers
[06/27/2022-08:30:53] [TRT] [V] After Myelin optimization: 1 layers
[06/27/2022-08:30:53] [TRT] [V] Applying ScaleNodes fusions.
[06/27/2022-08:30:53] [TRT] [V] After scale fusion: 1 layers
[06/27/2022-08:30:53] [TRT] [V] After vertical fusions: 1 layers
[06/27/2022-08:30:53] [TRT] [V] After dupe layer removal: 1 layers
[06/27/2022-08:30:53] [TRT] [V] After final dead-layer removal: 1 layers
[06/27/2022-08:30:53] [TRT] [V] After tensor merging: 1 l

In [8]:
import pandas as pd

# Tabulate the layer summary that build_engine() stored in the net_dict global.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [Pooling],LayerType.POOLING,"(1, 3, 225, 225)","(1, 3, 112, 112)","type=PoolingType.MAX wsize=(3, 3) stride=(2, 2..."


In [9]:
# Reference output of the built-in pooling layer, viewed as (C, H_out, W_out).
# Use the shared output_size instead of a hard-coded `input_size // 2`, so this
# cell follows the configured stride and matches how `result` is reshaped.
reference = trt_outputs[0].reshape((3, output_size, output_size))
print(reference)

[[[1.2468637  0.9731418  1.7283603  ... 1.7039526  1.7803113  1.3318315 ]
  [0.87263244 1.6231679  1.7283603  ... 1.0964074  1.6265202  2.3499799 ]
  [0.87263244 0.8002932  0.8002932  ... 1.4108361  1.6265202  2.3499799 ]
  ...
  [1.8190268  2.0853243  2.0853243  ... 2.7980096  2.7980096  1.5694978 ]
  [0.85090864 2.0072181  2.0072181  ... 2.075235   1.7450305  0.03003781]
  [0.99545527 2.0072181  2.0072181  ... 2.075235   1.7450305  0.09410294]]

 [[1.6933644  1.4559153  0.5049344  ... 1.1002792  1.566453   1.4299002 ]
  [2.2559497  2.2559497  0.36667484 ... 1.1002792  1.566453   0.83345956]
  [2.2559497  2.2559497  1.0379118  ... 2.6732693  1.2995731  1.2995731 ]
  ...
  [1.3822354  1.1411262  1.017575   ... 1.4276408  1.6172732  1.8445979 ]
  [1.7493899  1.7493899  1.4588192  ... 1.4615191  1.6172732  1.0922401 ]
  [1.9097644  1.7493899  1.6592412  ... 1.4615191  1.2949623  0.6548751 ]]

 [[1.7477155  1.7477155  1.4374763  ... 1.4319489  1.9334582  1.9334582 ]
  [1.7477155  1.747715

In [10]:
import sys
import os

# Plain-Python equivalent of the %pwd magic, so the cell also works outside IPython.
cur_path = os.getcwd()
plugin_path = os.path.join(cur_path, 'plugin')
# Make the generated protobuf module importable; skip the append when the
# cell is re-run so sys.path does not accumulate duplicates.
if plugin_path not in sys.path:
    sys.path.append(plugin_path)
from trt_plugin_pb2 import copy_Message
from trt_plugin_pb2 import pooling_Message
import trt_plugin_pb2

In [11]:
import ctypes

# Load the compiled plugin library from <plugin_path>/build.
# NOTE(review): loading presumably registers the plugin creators with TensorRT -- confirm.
lib_file = os.path.join(plugin_path, 'build', 'libPoolingPlugin.so')
lib = ctypes.cdll.LoadLibrary(lib_file)

In [12]:
import tensorrt as trt

# Show every registered plugin creator: first the names, then the namespaces.
registry = trt.get_plugin_registry()
print(*(
    [getattr(c, field) for c in registry.plugin_creator_list]
    for field in ('name', 'plugin_namespace')
), sep='\n')

['CustomQKVToContextPluginDynamic', 'CustomQKVToContextPluginDynamic', 'CustomQKVToContextPluginDynamic', 'RnRes2Br1Br2c_TRT', 'RnRes2Br1Br2c_TRT', 'RnRes2FullFusion_TRT', 'SmallTileGEMM_TRT', 'RNNTEncoderPlugin', 'DLRM_BOTTOM_MLP_TRT', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'CustomSkipLayerNormPluginDynamic', 'SingleStepLSTMPlugin', 'RnRes2Br2bBr2c_TRT', 'RnRes2Br2bBr2c_TRT', 'CustomGeluPluginDynamic', 'CustomFCPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'pooling', 'copy']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'macnica_trt_plugins', 'macnica_trt_plugins']


In [13]:
# Pick out the creators that live in the custom plugin namespace and register
# each of them under that namespace.
# NOTE(review): these creators are taken from the registry itself, so the
# re-registration looks redundant -- confirm whether it is still needed.
namespace = 'macnica_trt_plugins'
macnica_creators = list(filter(
    lambda creator: creator.plugin_namespace == namespace,
    registry.plugin_creator_list,
))
for c in macnica_creators:
    registry.register_creator(c, namespace)

In [14]:
def define_trt_plugin_network(network):
    """Populate `network` with a single custom pooling plugin layer.

    The network takes one float32 input named 'input' of shape
    (1, 3, input_size, input_size) and passes it through the 'pooling'
    plugin (version '1', namespace 'macnica_trt_plugins'), configured for
    max pooling with the notebook-level window size and stride and the CUDA
    implementation. The plugin output is exposed as 'output'.

    Args:
        network: The TensorRT network definition to fill in.

    Raises:
        RuntimeError: If the plugin creator is not registered or the plugin
            cannot be created from the serialized parameters.
    """
    plugin_type = 'pooling'
    plugin_namespace = 'macnica_trt_plugins'

    # Input
    input_tensor = network.add_input(
        name='input', dtype=trt.float32, shape=(1, 3, input_size, input_size))

    ### Custom Pooling Layer with CUDA or cuDNN ###
    creator = registry.get_plugin_creator(
        type=plugin_type, version='1', plugin_namespace=plugin_namespace)
    if creator is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise RuntimeError(
            "Plugin creator '%s' (version 1) not found in namespace '%s'; "
            "is the plugin library loaded?" % (plugin_type, plugin_namespace))

    # The plugin receives its parameters as a serialized protobuf message.
    message = pooling_Message(
        dims=input_tensor.shape,
        mode=trt_plugin_pb2.Maximum,
        window=[win_size, win_size],
        stride=[stride, stride],
        impl=trt_plugin_pb2.CUDA)
    plg = creator.deserialize_plugin(plugin_type, message.SerializeToString())
    if plg is None:
        raise RuntimeError(
            "Failed to create the '%s' plugin from its serialized parameters"
            % plugin_type)
    layer = network.add_plugin_v2(inputs=[input_tensor], plugin=plg)

    # Output
    output_tensor = layer.get_output(0)
    output_tensor.name = 'output'
    network.mark_output(tensor=output_tensor)

In [15]:
import trt_analyzer
import tensorrt as trt

# Network creation flag requesting an explicit batch dimension.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine2(logger):
    """Build a TensorRT engine for the custom pooling plugin network.

    As a side effect, the global `net_dict` is set to the layer summary
    produced by `trt_analyzer.network_dict` for the defined network.

    Args:
        logger: The `trt.Logger` used by both the builder and the runtime.

    Returns:
        The deserialized CUDA engine.

    Raises:
        RuntimeError: If the network cannot be built or the serialized plan
            cannot be deserialized.
    """
    # Use the `logger` argument for the runtime too, rather than the
    # TRT_LOGGER global, so the parameter is actually honored.
    with trt.Builder(logger) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.Runtime(logger) as runtime:
        # Newer TensorRT releases expose set_memory_pool_limit; older ones
        # only have the max_workspace_size attribute.
        if hasattr(config, 'set_memory_pool_limit'):
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
        else:
            config.max_workspace_size = 1 << 30
        # Define the network under test (custom pooling plugin layer).
        define_trt_plugin_network(network)
        # Get network info
        global net_dict
        net_dict = trt_analyzer.network_dict(network)
        # Build and return an engine.
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            raise RuntimeError('TensorRT failed to build the serialized network')
        engine = runtime.deserialize_cuda_engine(plan)
        if engine is None:
            raise RuntimeError('TensorRT failed to deserialize the built engine')
        return engine

In [16]:
# Reset the layer summary; build_engine2() repopulates this global as a side effect.
net_dict = None
with build_engine2(TRT_LOGGER) as engine:
    # Buffers are created by the project-local `common` helper module.
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        # Feed the same random batch that the reference engine received.
        inputs[0].host = input_batch.numpy()
        # Presumably runs inference and returns the host-side outputs -- verify against common.py.
        trt_outputs = common.do_inference_v2( \
            context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

[06/27/2022-08:30:56] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 793, GPU 3264 (MiB)
[06/27/2022-08:30:56] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 793, GPU 3264 (MiB)
[06/27/2022-08:30:56] [TRT] [V] Applying generic optimizations to the graph for inference.
[06/27/2022-08:30:56] [TRT] [V] Original: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After dead-layer removal: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After Myelin optimization: 1 layers
[06/27/2022-08:30:56] [TRT] [V] Applying ScaleNodes fusions.
[06/27/2022-08:30:56] [TRT] [V] After scale fusion: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After vertical fusions: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After dupe layer removal: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After final dead-layer removal: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After tensor merging: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After slice removal: 1 layers
[06/27/2022-08:30:56] [TRT] [V] After concat removal: 1 layer

In [17]:
# Tabulate the layer summary that build_engine2() stored in the net_dict global.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [PluginV2Ext],LayerType.PLUGIN_V2,"(1, 3, 225, 225)","(1, 3, 112, 112)",


In [18]:
# Plugin output viewed as (C, H_out, W_out) for comparison with the reference.
result = np.reshape(trt_outputs[0], (3, output_size, output_size))
print(result)

[[[1.2468637  0.9731418  1.7283603  ... 1.7039526  1.7803113  1.3318315 ]
  [0.87263244 1.6231679  1.7283603  ... 1.0964074  1.6265202  2.3499799 ]
  [0.87263244 0.8002932  0.8002932  ... 1.4108361  1.6265202  2.3499799 ]
  ...
  [1.8190268  2.0853243  2.0853243  ... 2.7980096  2.7980096  1.5694978 ]
  [0.85090864 2.0072181  2.0072181  ... 2.075235   1.7450305  0.03003781]
  [0.99545527 2.0072181  2.0072181  ... 2.075235   1.7450305  0.09410294]]

 [[1.6933644  1.4559153  0.5049344  ... 1.1002792  1.566453   1.4299002 ]
  [2.2559497  2.2559497  0.36667484 ... 1.1002792  1.566453   0.83345956]
  [2.2559497  2.2559497  1.0379118  ... 2.6732693  1.2995731  1.2995731 ]
  ...
  [1.3822354  1.1411262  1.017575   ... 1.4276408  1.6172732  1.8445979 ]
  [1.7493899  1.7493899  1.4588192  ... 1.4615191  1.6172732  1.0922401 ]
  [1.9097644  1.7493899  1.6592412  ... 1.4615191  1.2949623  0.6548751 ]]

 [[1.7477155  1.7477155  1.4374763  ... 1.4319489  1.9334582  1.9334582 ]
  [1.7477155  1.747715

In [19]:
# Mean absolute difference between the plugin output and the built-in reference.
# Vectorized in NumPy and accumulated in float64, instead of a Python-level
# sum over float32 elements with repeated flatten() copies.
print(np.abs(result.ravel() - reference.ravel()).mean(dtype=np.float64))

0.0
