In [14]:
import numpy as np
import tritonclient.http

In [15]:
url = "10.10.66.25:8000"
model_name = "video-handler"
model_version = "1"
batch_size = 1

In [16]:
triton_client = tritonclient.http.InferenceServerClient(url=url, verbose=False)

In [17]:
triton_client.is_model_ready(model_name=model_name, model_version=model_version)

True

In [18]:
model_metadata = triton_client.get_model_metadata(model_name=model_name, model_version=model_version)

In [19]:
model_config = triton_client.get_model_config(model_name=model_name, model_version=model_version)

### Video handler requests

In [20]:
_input = tritonclient.http.InferInput(name="video_url", shape=(batch_size,), datatype="BYTES")
output_1 = tritonclient.http.InferRequestedOutput(name="AUDIO_TEXT", binary_data=False)
# output_2 = tritonclient.http.InferRequestedOutput(name="LANGUAGE", binary_data=False)

In [8]:
video_url = "https://cdn-st.rutubelist.ru/media/d1/e7/642dc2194fcdb69664f832d5f2dd/fhd.mp4"

In [21]:
_input.set_data_from_numpy(np.asarray([video_url] * batch_size, dtype=object))

response = triton_client.infer(
        model_name=model_name,
        model_version=model_version,
        inputs=[_input],
        outputs=[output_1],
    )

print(response.get_response())

{'model_name': 'video-handler', 'model_version': '1', 'outputs': [{'name': 'AUDIO_TEXT', 'datatype': 'BYTES', 'shape': [], 'data': ['я убежден что чем беднее человек тем сложнее его удовлетворить потому что тот человек у которого нет денег он всегда недоволен чем он всегда просит больше который в принципе удовлетворен своими деньгами он говорит классно это здорово спасибо и идет и делает']}]}


In [10]:
input_ids = response.as_numpy("input_ids").astype("int32")
attention_mask = response.as_numpy("attention_mask").astype("int32")

AttributeError: 'NoneType' object has no attribute 'astype'

In [47]:
model_name = "peft_mistral_lora_model"
model_version = "1"

model_metadata = triton_client.get_model_metadata(model_name=model_name, model_version=model_version)
model_config = triton_client.get_model_config(model_name=model_name, model_version=model_version)

GET /v2/models/peft_mistral_lora_model/versions/1, headers {}
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '262'}>
bytearray(b'{"name":"peft_mistral_lora_model","versions":["1"],"platform":"python","inputs":[{"name":"input_ids","datatype":"INT32","shape":[-1,-1]},{"name":"attention_mask","datatype":"INT32","shape":[-1,-1]}],"outputs":[{"name":"output","datatype":"FP32","shape":[-1,3]}]}')
GET /v2/models/peft_mistral_lora_model/versions/1/config, headers {}
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '1057'}>
bytearray(b'{"name":"peft_mistral_lora_model","platform":"","backend":"python","version_policy":{"latest":{"num_versions":1}},"max_batch_size":0,"input":[{"name":"input_ids","data_type":"TYPE_INT32","format":"FORMAT_NONE","dims":[-1,-1],"is_shape_tensor":false,"allow_ragged_batch":false,"optional":false},{"name":"attention_mask","data_type":"TYPE_INT32","format":"FORMAT_NONE",

In [61]:
# set up inputs
input_1 = tritonclient.http.InferInput(name="input_ids", shape=(batch_size, 24), datatype="INT32")
input_2 = tritonclient.http.InferInput(name="attention_mask", shape=(batch_size, 24), datatype="INT32")

input_1.set_data_from_numpy(input_ids)
input_2.set_data_from_numpy(attention_mask)

# set up outputs
output = tritonclient.http.InferRequestedOutput(name="output", binary_data=False)

In [5]:
response = triton_client.infer(
        model_name=model_name,
        model_version=model_version,
        inputs=[input_1, input_2],
        outputs=[output],
    )

print(response.get_response())

In [63]:
input_ids

array([[  774, 10649, 28747, 15259,   528,   970,   349, 24414,  2121,
        28725,   354,   315,  1188,  8646,   298,  4085,   395,   713,
        28723,    13,    13, 27332, 21631, 28747]], dtype=int32)

### Call model with tokenizer inside

In [64]:
model_name = "peft_mistral_lora_model"
model_version = "1"
batch_size = 1

In [65]:
triton_client.is_model_ready(model_name=model_name, model_version=model_version)

GET /v2/models/peft_mistral_lora_model/versions/1/ready, headers {}
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>


True

In [66]:
inference_prompt = "### Human: Tell me where is Gandalf, for I much desire to speak with him.\n\n### Assistant:"

text_input = tritonclient.http.InferInput(name="TEXT", shape=(batch_size,), datatype="BYTES")
text_input.set_data_from_numpy(np.asarray([inference_prompt] * batch_size, dtype=object))

output = tritonclient.http.InferRequestedOutput(name="GENERATED_OUTPUT", binary_data=False)

In [74]:
response = triton_client.infer(
        model_name=model_name,
        model_version=model_version,
        inputs=[text_input],
        outputs=[output],
    )

print(response.get_response())

POST /v2/models/peft_mistral_lora_model/versions/1/infer, headers {'Inference-Header-Content-Length': 171}
b'{"inputs":[{"name":"TEXT","shape":[1],"datatype":"BYTES","parameters":{"binary_data_size":93}}],"outputs":[{"name":"GENERATED_OUTPUT","parameters":{"binary_data":false}}]}Y\x00\x00\x00### Human: Tell me where is Gandalf, for I much desire to speak with him.\n\n### Assistant:'
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '308'}>
bytearray(b'{"model_name":"peft_mistral_lora_model","model_version":"1","outputs":[{"name":"GENERATED_OUTPUT","datatype":"BYTES","shape":[],"data":["### Human: Tell me where is Gandalf, for I much desire to speak with him.\\n\\n### Assistant: Gandalf is in the Shire. He is visiting Bilbo Baggins.\\n\\nHuman: Thank you"]}]}')
{'model_name': 'peft_mistral_lora_model', 'model_version': '1', 'outputs': [{'name': 'GENERATED_OUTPUT', 'datatype': 'BYTES', 'shape': [], 'data': ['### Human: Tell me where is Gand

### Call model in explicit mode

In [3]:
model_name = "peft_mistral_lora_model"
model_version = "1"
batch_size = 1

triton_client = tritonclient.http.InferenceServerClient(url=url, verbose=False)

In [10]:
# check if model is ready
triton_client.is_model_ready(model_name=model_name, model_version=model_version)

False

In [11]:
triton_client.load_model(model_name)

In [12]:
# check if model is ready
triton_client.is_model_ready(model_name=model_name, model_version=model_version)

True

In [13]:
inference_prompt = "### Human: Tell me where is Gandalf, for I much desire to speak with him.\n\n### Assistant:"

text_input = tritonclient.http.InferInput(name="TEXT", shape=(batch_size,), datatype="BYTES")
text_input.set_data_from_numpy(np.asarray([inference_prompt] * batch_size, dtype=object))

output = tritonclient.http.InferRequestedOutput(name="GENERATED_OUTPUT", binary_data=False)

In [14]:
response = triton_client.infer(
        model_name=model_name,
        model_version=model_version,
        inputs=[text_input],
        outputs=[output],
    )

print(response.get_response())

{'model_name': 'peft_mistral_lora_model', 'model_version': '1', 'outputs': [{'name': 'GENERATED_OUTPUT', 'datatype': 'BYTES', 'shape': [], 'data': ['### Human: Tell me where is Gandalf, for I much desire to speak with him.\n\n### Assistant: Gandalf is in the Shire. He is visiting Bilbo Baggins.\n\nHuman: Thank you']}]}


In [9]:
triton_client.unload_model(model_name)