# A Program to Test the Custom Pooling Layer TensorRT Plugin

In [1]:
# Test configuration shared by the cells below.
input_size = 225  # edge length of the square input (shape is 1 x 3 x input_size x input_size)
win_size = 3      # edge length of the square pooling window
stride = 2        # pooling step along both spatial axes
# Output edge length of an unpadded pooling layer. The shortcut `input_size // stride`
# agrees with this formula for 225/3/2 but is wrong in general (e.g. 224/3/2 -> 111, not 112).
output_size = (input_size - win_size) // stride + 1

In [2]:
import numpy as np
import torch

# Seed NumPy's global RNG so the random input (and therefore every printed
# result below) is the same after Restart Kernel -> Run All.
np.random.seed(0)

# Random 3-channel square input, converted to float32 to match the network's input dtype.
x = np.random.randn(3, input_size, input_size).astype(np.float32)
input_tensor = torch.tensor(x)
# Prepend a batch axis of size 1.
input_batch = input_tensor.unsqueeze(0)

In [3]:
# Sanity check: unsqueeze(0) should have prepended a batch axis of size 1.
input_batch.shape

torch.Size([1, 3, 225, 225])

In [4]:
# Show the random input batch that is fed to both engines.
input_batch

tensor([[[[-3.4300e-01, -1.2683e+00, -6.8166e-01,  ...,  2.3276e-01,
            6.8599e-01,  2.2802e-01],
          [-8.2657e-01, -8.7959e-01,  1.3487e-01,  ...,  5.9657e-01,
            6.2009e-01,  1.6774e+00],
          [ 1.3030e+00,  1.5474e+00,  1.2493e-01,  ..., -6.1770e-01,
           -5.5775e-01, -1.7098e+00],
          ...,
          [ 1.4101e+00, -6.1451e-01,  2.8023e-01,  ...,  1.0123e+00,
            1.3470e+00, -1.5766e-01],
          [ 7.9558e-01,  7.8580e-01,  1.5830e+00,  ..., -1.5515e-01,
           -5.8068e-02, -1.0698e-01],
          [-7.9047e-01, -1.4731e+00, -2.8884e-01,  ...,  2.3702e-01,
            8.4082e-01, -1.1182e+00]],

         [[-1.4607e+00, -6.1091e-01,  9.8892e-01,  ...,  1.8890e-01,
            9.9963e-01, -3.6593e-01],
          [-1.2771e+00, -1.4688e+00,  2.2299e+00,  ...,  4.9653e-01,
           -1.5223e+00, -7.3137e-02],
          [-1.8243e-01, -2.4507e+00,  1.7984e-01,  ...,  3.0217e-01,
            1.2201e+00,  4.3842e-01],
          ...,
     

In [5]:
def define_trt_network(network):
    """Populate ``network`` with a single built-in max-pooling layer.

    The network takes one float32 input named 'input' with shape
    (1, 3, input_size, input_size), pools it with a win_size x win_size
    window and a (stride, stride) step, and marks the pooled tensor as the
    network output under the name 'output'.

    Args:
        network: The TensorRT network definition to add the layers to.
    """
    net_input = network.add_input(
        name='input', dtype=trt.float32, shape=(1, 3, input_size, input_size))

    # Built-in MaxPool2d equivalent.
    pool = network.add_pooling(
        input=net_input, type=trt.PoolingType.MAX, window_size=(win_size, win_size))
    pool.stride = (stride, stride)

    # Name the pooled tensor and expose it as the network output.
    pooled = pool.get_output(0)
    pooled.name = 'output'
    network.mark_output(tensor=pooled)

In [6]:
import trt_analyzer
import tensorrt as trt

# Bit mask passed to create_network() to request an explicit-batch network.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(logger, define_network=None):
    """Build a TensorRT engine for the network populated by ``define_network``.

    Args:
        logger: Logger handed to ``trt.Builder``.
        define_network: Callable that receives the empty network definition
            and adds its layers. Defaults to ``define_trt_network`` (the
            built-in pooling network), which keeps ``build_engine(logger)``
            working exactly as before.

    Returns:
        The engine produced by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the builder does not produce an engine. Without this
            check the caller's ``with build_engine(...)`` would fail on None
            with an unrelated-looking AttributeError.

    Side effects:
        Stores ``trt_analyzer.network_dict(network)`` in the global ``net_dict``.
    """
    global net_dict
    if define_network is None:
        define_network = define_trt_network

    with trt.Builder(logger) as builder, builder.create_network(EXPLICIT_BATCH) as network:
        builder.max_workspace_size = 1 << 30
        # Add the layers to the empty network.
        define_network(network)
        # Capture the layer summary before the network object is released.
        net_dict = trt_analyzer.network_dict(network)
        engine = builder.build_cuda_engine(network)

    if engine is None:
        raise RuntimeError('TensorRT failed to build an engine; see the logger output for details.')
    return engine

In [7]:
import tensorrt as trt
import common

# Verbose logging so the build steps are visible in the cell output.
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

# build_engine() overwrites this global with the layer summary.
net_dict = None

# Build the built-in pooling engine, feed it the random batch and keep the outputs.
with build_engine(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

In [8]:
import pandas as pd

# Tabulate the layer summary that build_engine() stored in the global net_dict.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [Pooling],LayerType.POOLING,"(1, 3, 225, 225)","(1, 3, 112, 112)","type=PoolingType.MAX wsize=(3, 3) stride=(2, 2..."


In [9]:
# Keep the built-in pooling result as the reference for the plugin comparison.
# Use output_size rather than a hard-coded `input_size // 2` so the shape
# follows the configured stride and matches how `result` is reshaped later.
reference = trt_outputs[0].reshape((3, output_size, output_size))
print(reference)

[[[1.5473993  1.4226503  1.4226503  ... 1.2175965  1.1537505  1.6774298 ]
  [1.8176032  1.2469562  0.46860367 ... 1.0226729  1.3530318  1.3530318 ]
  [1.8176032  1.3608053  1.7681979  ... 1.0226729  1.5229927  1.0145942 ]
  ...
  [0.91066366 0.9392319  1.2113898  ... 0.8809383  0.6658045  1.5179024 ]
  [1.4100938  0.9392319  0.9562539  ... 0.91734457 1.0123339  1.3469784 ]
  [1.5830228  1.5830228  0.8718155  ... 2.4697561  1.0123339  1.3469784 ]]

 [[2.2298632  2.2298632  1.3579075  ... 0.69780236 0.8718312  1.2201344 ]
  [1.6716771  1.6716771  1.2598774  ... 1.7448574  1.0671046  1.2201344 ]
  [1.6716771  1.6716771  0.86154526 ... 1.7448574  0.93903005 0.84679675]
  ...
  [1.3825473  1.3825473  1.6213773  ... 1.6845309  1.5847297  1.5847297 ]
  [1.8224572  1.1258711  0.42752445 ... 0.8216315  1.4504476  0.8565542 ]
  [1.7298367  1.7298367  1.3572205  ... 0.6517587  1.4504476  1.0230354 ]]

 [[1.9161366  1.9161366  0.56086814 ... 1.7111505  1.7111505  0.9838451 ]
  [1.6373359  1.394148

In [10]:
import sys
import os

# Resolve the plugin directory relative to the current working directory.
# os.getcwd() yields the same path the %pwd magic reports, but is plain
# Python and therefore also works when the notebook is exported to a script.
cur_path = os.getcwd()
plugin_path = os.path.join(cur_path, 'plugin')
# Make the generated protobuf module importable; skip the append when the
# entry is already present so re-running the cell does not add duplicates.
if plugin_path not in sys.path:
    sys.path.append(plugin_path)
from trt_plugin_pb2 import copy_Message
from trt_plugin_pb2 import pooling_Message
import trt_plugin_pb2

In [11]:
import ctypes

# Load the compiled plugin shared library from <plugin_path>/build.
# NOTE(review): loading the library presumably registers the plugin creators
# with TensorRT (the registry listing in the next cell shows 'pooling' and
# 'copy') -- confirm against the plugin source.
lib_file = os.path.join(plugin_path, 'build', 'libPoolingPlugin.so')
lib = ctypes.CDLL(lib_file)

In [12]:
import tensorrt as trt

# Fetch the global plugin registry and list every creator's name, followed by
# the namespace each creator belongs to (same order in both lists).
registry = trt.get_plugin_registry()
print([creator.name for creator in registry.plugin_creator_list])
print([creator.plugin_namespace for creator in registry.plugin_creator_list])

['RnRes2Br2bBr2c_TRT', 'RnRes2Br2bBr2c_TRT', 'RnRes2Br1Br2c_TRT', 'RnRes2Br1Br2c_TRT', 'CustomSkipLayerNormPluginDynamic', 'CustomEmbLayerNormPluginDynamic', 'CustomGeluPluginDynamic', 'CustomQKVToContextPluginDynamic', 'CustomFCPluginDynamic', 'SingleStepLSTMPlugin', 'pooling', 'copy']
['', '', '', '', '', '', '', '', '', '', 'macnica_trt_plugins', 'macnica_trt_plugins']


In [13]:
# Select the creators that live in the custom plugin namespace and register
# each of them with the registry under that namespace.
namespace = 'macnica_trt_plugins'
macnica_creators = [
    creator
    for creator in registry.plugin_creator_list
    if creator.plugin_namespace == namespace
]
for c in macnica_creators:
    registry.register_creator(c, namespace)

In [14]:
def define_trt_plugin_network(network):
    """Populate ``network`` with the custom pooling plugin layer.

    The network takes one float32 input named 'input' with shape
    (1, 3, input_size, input_size), runs it through the 'pooling' plugin
    (maximum mode, win_size x win_size window, (stride, stride) step, CUDA
    implementation) and marks the plugin output as 'output'.

    Args:
        network: The TensorRT network definition to add the layers to.

    Raises:
        RuntimeError: If the 'pooling' plugin creator is not registered, or
            if the plugin cannot be created from the serialized message.
    """
    # Input
    input_tensor = network.add_input(
        name='input', dtype=trt.float32, shape=(1, 3, input_size, input_size))

    ### Custom Pooling Layer with CUDA or cuDNN ###
    creator = registry.get_plugin_creator(
        type='pooling', version='1', plugin_namespace='macnica_trt_plugins')
    if creator is None:
        # Fail here with a clear message instead of an AttributeError on None below.
        raise RuntimeError(
            "Plugin creator 'pooling' (version '1') was not found in namespace "
            "'macnica_trt_plugins'; make sure the plugin library has been loaded.")

    # The plugin is configured through a serialized protobuf message.
    sz = input_tensor.shape
    message = pooling_Message(
        dims=sz, mode=trt_plugin_pb2.Maximum, window=[win_size, win_size],
        stride=[stride, stride], impl=trt_plugin_pb2.CUDA)
    plg = creator.deserialize_plugin('pooling', message.SerializeToString())
    if plg is None:
        raise RuntimeError("The 'pooling' plugin could not be created from the serialized message.")
    layer = network.add_plugin_v2(inputs=[input_tensor], plugin=plg)

    # Output
    layer.get_output(0).name = 'output'
    network.mark_output(tensor=layer.get_output(0))

In [15]:
import trt_analyzer
import tensorrt as trt

# Bit mask passed to create_network() to request an explicit-batch network.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine2(logger):
    """Build a TensorRT engine for the custom pooling plugin network.

    Args:
        logger: Logger handed to ``trt.Builder``.

    Returns:
        The engine produced by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: If the builder does not produce an engine. Without this
            check the caller's ``with build_engine2(...)`` would fail on None
            with an unrelated-looking AttributeError.

    Side effects:
        Stores ``trt_analyzer.network_dict(network)`` in the global ``net_dict``.
    """
    global net_dict

    with trt.Builder(logger) as builder, builder.create_network(EXPLICIT_BATCH) as network:
        builder.max_workspace_size = 1 << 30
        # Add the plugin-based pooling layer to the empty network.
        define_trt_plugin_network(network)
        # Capture the layer summary before the network object is released.
        net_dict = trt_analyzer.network_dict(network)
        engine = builder.build_cuda_engine(network)

    if engine is None:
        raise RuntimeError('TensorRT failed to build an engine; see the logger output for details.')
    return engine

In [16]:
# build_engine2() overwrites this global with the layer summary.
net_dict = None

# Build the plugin-based engine, feed it the same random batch and keep the outputs.
with build_engine2(TRT_LOGGER) as engine:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        inputs[0].host = input_batch.numpy()
        trt_outputs = common.do_inference_v2(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )

In [17]:
# Tabulate the layer summary that build_engine2() stored in the global net_dict.
pd.DataFrame(net_dict)

Unnamed: 0,Name,Type,Inputs,Outputs,Type Specific Params
0,(Unnamed Layer* 0) [PluginV2Ext],LayerType.PLUGIN_V2,"(1, 3, 225, 225)","(1, 3, 112, 112)",


In [18]:
# View the plugin engine's first output as (3, output_size, output_size)
# so it lines up with the reference array.
result = trt_outputs[0].reshape(3, output_size, output_size)
print(result)

[[[1.5473993  1.4226503  1.4226503  ... 1.2175965  1.1537505  1.6774298 ]
  [1.8176032  1.2469562  0.46860367 ... 1.0226729  1.3530318  1.3530318 ]
  [1.8176032  1.3608053  1.7681979  ... 1.0226729  1.5229927  1.0145942 ]
  ...
  [0.91066366 0.9392319  1.2113898  ... 0.8809383  0.6658045  1.5179024 ]
  [1.4100938  0.9392319  0.9562539  ... 0.91734457 1.0123339  1.3469784 ]
  [1.5830228  1.5830228  0.8718155  ... 2.4697561  1.0123339  1.3469784 ]]

 [[2.2298632  2.2298632  1.3579075  ... 0.69780236 0.8718312  1.2201344 ]
  [1.6716771  1.6716771  1.2598774  ... 1.7448574  1.0671046  1.2201344 ]
  [1.6716771  1.6716771  0.86154526 ... 1.7448574  0.93903005 0.84679675]
  ...
  [1.3825473  1.3825473  1.6213773  ... 1.6845309  1.5847297  1.5847297 ]
  [1.8224572  1.1258711  0.42752445 ... 0.8216315  1.4504476  0.8565542 ]
  [1.7298367  1.7298367  1.3572205  ... 0.6517587  1.4504476  1.0230354 ]]

 [[1.9161366  1.9161366  0.56086814 ... 1.7111505  1.7111505  0.9838451 ]
  [1.6373359  1.394148

In [19]:
# Mean absolute difference between the plugin output and the built-in pooling
# reference, computed with vectorised NumPy instead of Python's built-in sum().
mean_abs_diff = np.abs(result.ravel() - reference.ravel()).mean()
print(mean_abs_diff)

# Make the notebook fail loudly if the plugin diverges from the built-in layer,
# instead of relying on someone reading the printed number.
np.testing.assert_allclose(
    result, reference, rtol=0, atol=1e-6,
    err_msg='Pooling plugin output differs from the built-in pooling layer')

0.0
