In [None]:
# Requirments 
#!cp -r /models .
#pip install tritonclient['all']
#pip install tqdm

# Deploy models on ovhai using Triton
In this section, we will use the ovhai tutorial to demonstrate how to deploy machine learning models on the ovhai cloud using Triton. 
You must first import the same storage container from which we exported part one models (onnx/jit/torch), and then configure the config files.

In [3]:
configuration = """
name: "pytorch-model-gpu"
platform: "pytorch_libtorch"
max_batch_size: 32
instance_group [
    {
      count: 1
      kind: KIND_GPU
    }
  ]
input [
 {
    name: "input"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 3, 224, 224 ]
  }
]
output {
    name: "output"
    data_type: TYPE_FP32
    dims: [ 1000 ]
  }
"""

with open('models/pytorch-model-gpu/config.pbtxt', 'w') as file:
    file.write(configuration)

In [4]:
configuration = """
name: "onnx-model-gpu"
platform: "onnxruntime_onnx"
max_batch_size: 32
instance_group [
    {
      count: 1
      kind: KIND_GPU
    }
  ]
input [
 {
    name: "input"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 3, 224, 224 ]
  }
]
output {
    name: "output"
    data_type: TYPE_FP32
    dims: [ 1000 ]
  }
"""

with open('models/onnx-model-gpu/config.pbtxt', 'w') as file:
    file.write(configuration)

In [6]:
configuration = """
name: "tensorrt-model"
platform: "tensorrt_plan"
max_batch_size: 32
instance_group [
    {
      count: 1
      kind: KIND_GPU
    }
  ]
input [
 {
    name: "input"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 3, 224, 224 ]
  }
]
output {
    name: "output"
    data_type: TYPE_FP32
    dims: [ 1000 ]
  }
"""

with open('models/tensorrt-model/config.pbtxt', 'w') as file:
    file.write(configuration)

### Launch the Triton server


In [None]:
!tritonserver --model-repository=/workspace/models/

### Send requests using tritonclient

In [10]:
# Import the required libraries  and initialize Triton variables. 
import os
import json
import argparse
import numpy as np
import tritonhttpclient
import tritongrpcclient
import time
from tqdm import tqdm

#  Set according to your config, default is localhost (http_url= 'localhost:8000' / grpc_url = 'localhost:8001')
http_url = 'localhost:8000'
grpc_url = 'localhost:8001'
verbose = False
concurrency = 100
model_version = '1'
batch_size = 1
triton_http_client = tritonhttpclient.InferenceServerClient(url=http_url, verbose=verbose)
triton_grpc_client = tritongrpcclient.InferenceServerClient(url=grpc_url, verbose=verbose)

input_dtype = 'FP32'
input_name = 'input'
input_shape = (1, 3, 224, 224)
output_name = 'output'

input0 = tritonhttpclient.InferInput(input_name, input_shape, input_dtype)
dummy_data = np.ones(shape=input_shape, dtype=np.float32)
input0.set_data_from_numpy(dummy_data, binary_data=True)
output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=True)

In [11]:
# run ONNX inferences
model_name = 'onnx-model-gpu'
requests = []
request_count = 1000

start_time = time.time()
for i in tqdm(range(request_count)):
    requests.append(triton_http_client.infer(model_name, model_version=model_version, inputs=[input0], outputs=[output]))
end_time = time.time()

print(model_name)
print('Average Latency: ~{} seconds'.format((end_time - start_time) / request_count))
print('Average Throughput: ~{} examples / second'.format(batch_size * request_count / (end_time - start_time)))

100%|██████████| 1000/1000 [00:09<00:00, 104.92it/s]

onnx-model-gpu
Average Latency: ~0.009536562204360962 seconds
Average Throughput: ~104.85958971071476 examples / second





In [13]:
# run TorchScript inferences
model_name = 'tensorrt-model'
requests = []
request_count = 1000

start_time = time.time()
for i in tqdm(range(request_count)):
    requests.append(triton_http_client.infer(model_name, model_version=model_version, inputs=[input0], outputs=[output]))
end_time = time.time()

print(model_name)
print('Average Latency: ~{} seconds'.format((end_time - start_time) / request_count))
print('Average Throughput: ~{} examples / second'.format(batch_size * request_count / (end_time - start_time)))

100%|██████████| 1000/1000 [00:06<00:00, 157.77it/s]

tensorrt-model
Average Latency: ~0.006342987060546875 seconds
Average Throughput: ~157.65442849788548 examples / second





In [16]:
# Import the required libraries  and initialize Triton variables. 
import os
import json
import argparse
import numpy as np
import tritonhttpclient
import tritongrpcclient
import time
from tqdm import tqdm

#  Set according to your config, default is localhost (http_url= 'localhost:8000' / grpc_url = 'localhost:8001')
http_url = 'localhost:8000'
grpc_url = 'localhost:8001'
verbose = False
concurrency = 100
model_version = '1'
batch_size = 1
triton_http_client = tritonhttpclient.InferenceServerClient(url=http_url, verbose=verbose)
triton_grpc_client = tritongrpcclient.InferenceServerClient(url=grpc_url, verbose=verbose)

input_dtype = 'FP32'
input_name = 'input__0'
input_shape = (1, 3, 224, 224)
output_name = 'output__0'

input0 = tritonhttpclient.InferInput(input_name, input_shape, input_dtype)
dummy_data = np.ones(shape=input_shape, dtype=np.float32)
input0.set_data_from_numpy(dummy_data, binary_data=True)
output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=True)

# run ONNX inferences
model_name = 'pytorch-model-gpu'
requests = []
request_count = 1000

start_time = time.time()
for i in tqdm(range(request_count)):
    requests.append(triton_http_client.infer(model_name, model_version=model_version, inputs=[input0], outputs=[output]))
end_time = time.time()

print(model_name)
print('Average Latency: ~{} seconds'.format((end_time - start_time) / request_count))
print('Average Throughput: ~{} examples / second'.format(batch_size * request_count / (end_time - start_time)))

100%|██████████| 1000/1000 [00:17<00:00, 58.57it/s]

pytorch-model-gpu
Average Latency: ~0.0170787250995636 seconds
Average Throughput: ~58.55237988610475 examples / second





### Launch the Flask server 
Launch the server from a terminal using 
* python flask_app/main.py

### Benchmark the Flask server

In [None]:
#run simple Flask Server
import requests

data = {'arr': dummy_data.tolist()}
response = requests.post("http://127.0.0.1:5000/predict", json=data)

start_time = time.time()
request_count = 1000

for i in tqdm(range(request_count)):
    response = requests.post("http://127.0.0.1:5000/predict", json=data)
end_time = time.time()

print('simple flask')
print('Average Latency: ~{} seconds'.format((end_time - start_time) / request_count))
print('Average Throughput: ~{} examples / second'.format(batch_size * request_count / (end_time - start_time)))

### Run Perf Analyzer

In [7]:
!wget https://github.com/triton-inference-server/server/releases/download/v2.19.0/v2.19.0_ubuntu2004.clients.tar.gz
!tar -xvf 
!./bin/perf_analyzer -m onnx-model-gpu
!./bin/perf_analyzer -m pytorch-model-gpu
!./bin/perf_analyzer -m tensorrt-model

/bin/bash: ./bin/perf_analyzer: No such file or directory
/bin/bash: ./bin/perf_analyzer: No such file or directory
E0319 13:43:58.253928372 1656448 backup_poller.cc:133]       Run client channel backup poller: {"created":"@1647697438.253808720","description":"pollset_work","file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":320,"referenced_errors":[{"created":"@1647697438.253800479","description":"Bad file descriptor","errno":9,"file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":950,"os_error":"Bad file descriptor","syscall":"epoll_wait"}]}
/bin/bash: ./bin/perf_analyzer: No such file or directory


### Request Triton prometheus metrics

In [None]:
!curl -v 127.0.0.1:8002/metrics