In [1]:
import os
import time
import numpy as np
import wurlitzer

import trtlab

# Alias wurlitzer.sys_pipes so later cells can write `with display_output():`
# and have stdout/stderr from the backend C++ inference runtime captured
# and shown in the notebook output.
display_output = wurlitzer.sys_pipes

In [2]:
# Run the model setup script through the shell (IPython `!` escape).
# NOTE(review): presumably this prepares the *.engine files under /work/models
# that are registered further down — confirm against the script.
!/work/models/setup.py

# Local Inference Setup

In [2]:
# Construct the inference manager inside display_output() so the backend's
# log lines are shown. The requested max_exec_concurrency=4 is echoed in the
# log as "Maximum Execution Concurrency: 4".
with display_output():
    manager = trtlab.InferenceManager(max_exec_concurrency=4)

I0204 22:01:27.543411   925 inference_manager.cc:64] -- Initialzing TensorRT Resource Manager --
I0204 22:01:27.543426   925 inference_manager.cc:65] Maximum Execution Concurrency: 4
I0204 22:01:27.543429   925 inference_manager.cc:66] Maximum Copy Concurrency: 8


In [3]:
with display_output():
    manager.register_tensorrt_engine("rn50-b1", "/work/models/ResNet-50-b1-fp16.engine")
    manager.register_tensorrt_engine("rn50-b8", "/work/models/ResNet-50-b8-fp16.engine")

I0204 22:01:30.164453   925 model.cc:91] Binding: data; isInput: true; dtype size: 4; bytes per batch item: 602112
I0204 22:01:30.164479   925 model.cc:91] Binding: prob; isInput: false; dtype size: 4; bytes per batch item: 4000
I0204 22:01:30.169529   925 inference_manager.cc:149] -- Registering Model: rn50-b1 --
I0204 22:01:30.169546   925 inference_manager.cc:150] Input/Output Tensors require 591.9 KiB
I0204 22:01:30.169550   925 inference_manager.cc:151] Execution Activations require 5.7 MiB
I0204 22:01:30.169554   925 inference_manager.cc:155] Weights require 75.8 MiB
I0204 22:01:30.223752   925 model.cc:91] Binding: data; isInput: true; dtype size: 4; bytes per batch item: 602112
I0204 22:01:30.223776   925 model.cc:91] Binding: prob; isInput: false; dtype size: 4; bytes per batch item: 4000
I0204 22:01:30.227011   925 inference_manager.cc:149] -- Registering Model: rn50-b8 --
I0204 22:01:30.227035   925 inference_manager.cc:150] Input/Output Tensors require 4.6 MiB
I0204 22:01:3

In [4]:
# Ask the manager to (re)allocate its resources now that the engines are
# registered; the log output reports the execution tokens and the pool of
# host/device memory stacks that get created.
with display_output():
    manager.update_resources()

I0204 22:01:31.025523   925 inference_manager.cc:194] -- Allocating TensorRT Resources --
I0204 22:01:31.025539   925 inference_manager.cc:195] Creating 4 TensorRT execution tokens.
I0204 22:01:31.025542   925 inference_manager.cc:196] Creating a Pool of 8 Host/Device Memory Stacks
I0204 22:01:31.025550   925 inference_manager.cc:197] Each Host Stack contains 4.7 MiB
I0204 22:01:31.025554   925 inference_manager.cc:198] Each Device Stack contains 4.8 MiB
I0204 22:01:31.025559   925 inference_manager.cc:199] Total GPU Memory: 197.5 MiB


# Local Inference Properties

In [5]:
# Obtain one runner per registered engine, looked up by registration name.
b1, b8 = (manager.infer_runner(name) for name in ("rn50-b1", "rn50-b8"))

In [6]:
# Input bindings reported by the rn50-b1 runner: binding name -> dtype and shape.
b1.input_bindings()

{'data': {'dtype': dtype('float32'), 'shape': [3, 224, 224]}}

In [7]:
# Maximum batch size reported by the rn50-b1 runner.
b1.max_batch_size()

1

In [8]:
# Input bindings reported by the rn50-b8 runner: binding name -> dtype and shape.
b8.input_bindings()

{'data': {'dtype': dtype('float32'), 'shape': [3, 224, 224]}}

In [9]:
# Maximum batch size reported by the rn50-b8 runner.
b8.max_batch_size()

8

In [10]:
def max_batch_size_shape(x, input='data'):
    """Build the input shape of ``x`` at its maximum batch size.

    Args:
        x: object exposing ``max_batch_size()`` and ``input_bindings()``.
        input: name of the input binding whose ``'shape'`` entry is used.

    Returns:
        A new list holding the maximum batch size followed by the entries
        of the selected binding's shape.
    """
    leading_dim = x.max_batch_size()
    item_shape = x.input_bindings()[input]['shape']
    return [leading_dim, *item_shape]

In [12]:
max_batch_size_shape(b8)

[8, 3, 224, 224]

# Compute

Here we launch two asynchronous inferences using two different TensorRT engines: one built for a batch size of 1, the other for a batch size of 8.  Although both engines are built from the same ResNet-50 model, they could be any two distinct TensorRT engines.

Note: for this example the weights of the model and the input tensors are all random values.

In [18]:
# Launch one asynchronous inference per engine; each infer() call yields an
# object whose result is collected later with .get().
# np.random.random_sample produces float64, while input_bindings() reports
# float32 for the 'data' binding, so cast the random input to the binding's
# dtype explicitly instead of relying on a conversion inside infer().
futures = [
    model.infer(
        data=np.random.random_sample(max_batch_size_shape(model)).astype(
            model.input_bindings()['data']['dtype']
        )
    )
    for model in [b1, b8]
]

In [19]:
# free to do other work while inference is being computed

In [20]:
# Collect the result of every launched inference via .get(), in launch order.
results = [pending.get() for pending in futures]

In [26]:
# Report the shape of every output binding returned by each inference.
template = "{} binding has shape: {}"
for outputs in results:
    for binding_name, array in outputs.items():
        print(template.format(binding_name, array.shape))

prob binding has shape: (1, 1000, 1, 1)
prob binding has shape: (8, 1000, 1, 1)
