In [None]:
import sys
import numpy as np
import requests
from mlserver.types import InferenceRequest, InferenceErrorResponse, InferenceResponse, RequestInput
from mlserver.codecs import StringCodec, NumpyCodec, StringRequestCodec
import json

## Building Deployment

In [None]:
# Since we're deploying several versions of spaCy, we create one folder per version holding the relevant artifacts.
# Build copies the folders out of the src directory and creates the conda tar file used to run our models on Triton.
# NOTE(review): the examples below invoke `utils.py` while this line invokes `utils` — confirm which entry point is intended.
!python utils build

# spaCy model download example
# NOTE(review): this example writes under ../dist/model/ while the next one uses ../dist/models/ — confirm the directory name.
# !python utils.py download -spcy en_core_web_sm -o ../dist/model/spacy_ner_sm/1/artifacts --model-version 3.7.0

# Download HuggingFace repo example
# !python utils.py download -hf openai/clip-vit-large-patch14 -o ../dist/models/clip/1/artifacts

## Starting Triton and loading models

Here we'll start up a Triton server with `docker compose up`. Because the server is started with the `--model-control-mode=explicit` option, Triton will not try to automatically load every model in its model repository.  

This is useful as it will let us choose when to load and unload models for testing, dramatically speeding up development and debugging.

In [None]:
# Starts CPU version; `-d` runs the compose services detached so the cell returns immediately
!docker compose up -d

### List / Load / Unload a model

In [None]:
def get_available():
    """POST to the server's model repository index endpoint.

    Returns:
        The ``requests.Response`` of the call; callers read ``.text`` to see
        the listing.
    """
    index_url = "http://localhost:8080/v2/repository/models/index"
    response = requests.post(index_url)
    return response

def get_server_health(timeout=5.0):
    """Probe the server's V2 health endpoints.

    A check counts as passing only when the endpoint answers with HTTP 200.
    Connection failures and timeouts are reported as ``False`` instead of
    raising, so this can be called before the server has finished starting.

    Args:
        timeout: Seconds to wait for each health request before treating the
            check as failed.

    Returns:
        dict: ``{"live": bool, "ready": bool}``.
    """
    health_urls = {
        "live": "http://localhost:8080/v2/health/live",
        "ready": "http://localhost:8080/v2/health/ready",
    }

    status = {}
    for check, url in health_urls.items():
        try:
            status[check] = requests.get(url, timeout=timeout).status_code == 200
        except requests.RequestException:
            # Server unreachable (not started yet, refused, timed out): that is
            # precisely the "not healthy" answer, not an error for the caller.
            status[check] = False

    return status

def unload_model(model_name):
    """POST an unload request for ``model_name`` to the model repository API.

    Args:
        model_name: Name of the model, placed in the request URL.

    Returns:
        The ``requests.Response`` of the call.
    """
    unload_url = "http://localhost:8080/v2/repository/models/{}/unload".format(model_name)
    return requests.post(unload_url)

def load_model(model_name):
    """POST a load request for ``model_name`` to the model repository API.

    Args:
        model_name: Name of the model, placed in the request URL.

    Returns:
        The ``requests.Response`` of the call.
    """
    load_url = "http://localhost:8080/v2/repository/models/{}/load".format(model_name)
    return requests.post(load_url)

def get_model_stats(model_name):
    """GET the statistics endpoint for ``model_name``.

    Args:
        model_name: Name of the model, placed in the request URL.

    Returns:
        The ``requests.Response`` of the call; callers decode it with
        ``.json()``.
    """
    stats_url = "http://localhost:8080/v2/models/{}/stats".format(model_name)
    return requests.get(stats_url)

def run_inference(model_name, _payload):
    """Send ``_payload`` to ``model_name`` as a V2 inference request.

    The payload is wrapped in a single ``BYTES`` input named ``INPUT__0`` with
    shape ``[len(_payload)]`` and the ``str`` content type. The request body
    is printed as JSON before it is POSTed.

    Args:
        model_name: Name of the model, placed in the ``/infer`` URL.
        _payload: Sized collection (e.g. a list of strings) used as the input
            data.

    Returns:
        The ``requests.Response`` from the ``/infer`` endpoint.
    """
    encoded_input = RequestInput(
        name="INPUT__0",
        shape=[len(_payload)],
        datatype="BYTES",
        parameters={"content_type": "str"},
        data=_payload,
    )

    inference_request = InferenceRequest(
        # The V2 request id is a string; an int is not coerced by stricter
        # model validation (pydantic v2) and would fail before sending.
        id="1",
        inputs=[encoded_input],
        parameters={"payload_type": "text"},
    )

    # Serialise once so the printed body is exactly the body that is sent.
    request_body = inference_request.dict()

    print('----- JSON dump of the V2 request -----')
    print(json.dumps(request_body))

    endpoint = f"http://localhost:8080/v2/models/{model_name}/infer"
    return requests.post(endpoint, json=request_body)

In [None]:
# Bare expression: the returned live/ready dict is shown as the cell output.
get_server_health()

In [None]:
# Print the raw body of the repository index response.
print(get_available().text)

In [None]:
# Request a load, then print the repository index again to see the result.
# NOTE(review): "{{cookiecutter.model_name}}" looks like a template placeholder,
# presumably substituted when the project is rendered — confirm.
load_model("{{cookiecutter.model_name}}")
print(get_available().text)

## Inference request for text embedding

In [None]:

# Single-sentence batch sent to the model as one BYTES input.
payload = ["Bill ate Robert and had stuff to do in Washington"]

response = run_inference("{{cookiecutter.model_name}}", payload)

print('\n----- Server Unparsed Response -----')
print(response.text)

# Fail here on an HTTP error (the raw body is already printed above) instead of
# letting an error payload surface as a misleading InferenceResponse
# validation failure.
response.raise_for_status()
inf_response = InferenceResponse.parse_raw(response.text)

print('\n----- Server Parsed Response -----')
print(f'Model Name: {inf_response.model_name}')
print(f'Model Version: {inf_response.model_version}')
# Only the first output is decoded, as strings.
parsed_resp = StringCodec().decode_output(inf_response.outputs[0])
print(f'Response: {parsed_resp}')

In [None]:
# Print the model's statistics response decoded from JSON.
print(get_model_stats("{{cookiecutter.model_name}}").json())

In [None]:
# Request an unload, then print the repository index again to see the result.
unload_model("{{cookiecutter.model_name}}")
print(get_available().text)