## Deploy TensorRT engine
A notebook to test that the TensorRT engine can be deployed to a Triton Inference
Server. This also tests that inference can be performed.

In [1]:
import shutil
import subprocess
import time
from pathlib import Path
from tempfile import TemporaryDirectory

import numpy as np
from tritonclient import http

In [2]:
# Path to the TensorRT engine file to deploy; it is copied into the Triton model
# directory as "model.plan". The empty default resolves to the current directory,
# which cannot be copied as a file, so set this before running the notebook.
MODEL_PATH = Path("")

### Start the Triton server

In [3]:
# Fail fast with an actionable message instead of an opaque copy error when
# MODEL_PATH is unset (Path("") is the current directory) or points to a missing file.
if not MODEL_PATH.is_file():
    raise FileNotFoundError(
        f"MODEL_PATH={str(MODEL_PATH)!r} is not a file; "
        "set it to the TensorRT engine that should be deployed."
    )

# Stage the engine in a temporary directory; this directory is later mounted into
# the container as /models/vit/1, with the engine stored as "model.plan".
temp_dir = TemporaryDirectory()
temp_model_dir = Path(temp_dir.name)
_ = shutil.copy(MODEL_PATH, temp_model_dir / "model.plan")

In [None]:
# Remove any container left over from a previous run so that re-running this cell
# does not fail on a "vit-triton" name conflict. A non-zero exit (nothing to remove)
# is expected and ignored here.
subprocess.run(
    ["nvidia-docker", "rm", "-f", "vit-triton"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

# Start Triton detached, mounting the staged engine as version 1 of the "vit" model.
# check=True raises CalledProcessError if the container cannot be launched, instead
# of silently continuing to the inference cells.
subprocess.run(
    [
        "nvidia-docker",
        "run",
        "-dit",
        "--net=host",
        "--name",
        "vit-triton",
        "-v",
        f"{temp_model_dir}:/models/vit/1",
        "nvcr.io/nvidia/tritonserver:24.09-py3",
        "tritonserver",
        "--model-repository=/models",
    ],
    check=True,
)

### Run inference

In [5]:
client = http.InferenceServerClient("localhost:8000")

# The container is started detached, so the server may still be booting and loading
# the engine when this cell runs. Poll until both the server and the "vit" model
# report ready, and give up with a clear error after READY_TIMEOUT_S seconds.
READY_TIMEOUT_S = 120
ready_deadline = time.monotonic() + READY_TIMEOUT_S
while True:
    try:
        if client.is_server_ready() and client.is_model_ready("vit"):
            break
        last_error = None
    except Exception as exc:  # the HTTP endpoint refuses connections until it is up
        last_error = exc
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Triton did not report model 'vit' ready within {READY_TIMEOUT_S}s"
        ) from last_error
    time.sleep(1)

In [None]:
# Build a random FP32 batch and wrap it in the request input named "input"
input_shape = [32, 3, 256, 256]
infer_input = http.InferInput("input", input_shape, "FP32")
input_batch = np.random.rand(*input_shape).astype(np.float32)
infer_input.set_data_from_numpy(input_batch)
inputs = [infer_input]

# Send the batch to the "vit" model
results = client.infer("vit", inputs)

# Read the tensor named "output" back as a NumPy array
output = results.as_numpy("output")

print(output.shape)

### Clean up

In [None]:
# Force-remove the Triton container, then delete the temporary model directory
remove_container_cmd = ["nvidia-docker", "rm", "-f", "vit-triton"]
subprocess.run(remove_container_cmd)
_ = temp_dir.cleanup()