In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Online Prediction Dedicated Endpoint

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/get_started_with_dedicated_endpoint.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fget_started_with_dedicated_endpoint.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/get_started_with_dedicated_endpoint.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/get_started_with_dedicated_endpoint.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

A dedicated public endpoint is a public endpoint for online prediction. It offers the following benefits:

* Dedicated networking: When you send a prediction request to a dedicated public endpoint, it is isolated from other users' traffic.
* Optimized network latency
* Larger payload support: Up to 10 MB.
* Longer request timeouts: Configurable up to 1 hour.
* Generative AI-ready: Streaming and gRPC are supported.

## Get started

### Install Vertex AI SDK for Python and other required packages

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only Colab needs this: restarting the kernel makes the newly installed
# packages usable. Passing True asks for a restart rather than a plain shutdown.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
# `sys` is imported again because the previous step restarts the kernel on Colab.
import sys

# Outside Colab this cell does nothing.
if "google.colab" in sys.modules:

    from google.colab import auth

    # Authenticate the Colab user.
    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Replace the placeholder with the ID of your own Google Cloud project.
# LOCATION is the region passed to the Vertex AI SDK and to the bucket creation command.

PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [None]:
from google.cloud import aiplatform

# Initialize the Vertex AI SDK with the project and region chosen above.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Create the Cloud Storage bucket that will hold the model artifacts.
# Replace the placeholder with a bucket name of your own before running.
BUCKET_URI = "gs://your-bucket-name-unique"  # @param {type:"string"}
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

## Create Dedicated Endpoint and Deploy Model

### Create Dedicated Endpoint
When creating the endpoint, set `dedicated_endpoint_enabled` to `True`.

In [None]:
# dedicated_endpoint_enabled=True is what makes this a dedicated endpoint.
endpoint = aiplatform.Endpoint.create(
    display_name="test-dedicated-endpoint",
    dedicated_endpoint_enabled=True,
)

### Prepare Test Models

We prepared a TensorFlow test model; feel free to use your own model instead.

In [None]:
# Show which google-cloud-aiplatform version is installed.
! pip freeze | grep google-cloud-aiplatform

In [None]:
# Print the display name and resource name of every model returned by
# aiplatform.Model.list(), one value per line.
for my_model in aiplatform.Model.list():
    print(my_model.display_name, my_model.gca_resource.name, sep="\n")

# To reuse one of the models listed above, load it by its resource name:
# model = aiplatform.Model('projects/12345/locations/us-central1/models/456789')

In [None]:
# @title Upload a new tensorflow model
# @markdown You can skip this if you use an existing model.

# TF Model
DISPLAY_NAME = "tensorflow model"  # @param {type:"string"}
# The test model artifacts are staged under the "tensorflow" folder of the bucket.
ARTIFACT_URI = BUCKET_URI + "/tensorflow"
# Serving container image used for the uploaded model.
IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-12:latest"

# Copy Test Models to the Bucket
! gsutil -m cp -r "gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/tensorflow/*" {ARTIFACT_URI}

# Upload the copied artifacts as a Vertex AI model served by IMAGE_URI.
model = aiplatform.Model.upload(
    display_name=DISPLAY_NAME,
    artifact_uri=ARTIFACT_URI,
    serving_container_image_uri=IMAGE_URI,
)

#### Deploy Model

Deploy the model to the endpoint. If this is your first model deployed to a dedicated endpoint, the deployment takes about 30 minutes.

In [None]:
# Route all traffic to this first deployed model.
endpoint.deploy(model=model, traffic_percentage=100, machine_type="e2-standard-8")

## Make Predictions

A dedicated endpoint cannot be accessed through the shared `aiplatform.googleapis.com` DNS. \
Instead, it has its own dedicated DNS:
```
DEDICATED_DNS = f"https://{endpoint.gca_resource.dedicated_endpoint_dns}/v1beta1/{endpoint.resource_name}"
# or
DEDICATED_DNS = f"https://{ENDPOINT_ID}.{LOCATION}-{PROJECT_NUMBER}.prediction.vertexai.goog"
```

The Python SDK supports dedicated endpoints through the `use_dedicated_endpoint` flag.

Alternatively, you can send HTTP/gRPC requests directly to this DNS in any language you prefer.

In [None]:
# @title Predict
# @markdown You can use client library.

use_python_sdk = True # @param {type:"boolean"}
# @markdown response = my_endpoint.predict( \
# @markdown &nbsp;&ensp; instances=[{"feat_1":val_1, "feat_2":val_2}], \
# @markdown &nbsp;&ensp; use_dedicated_endpoint=True, \
# @markdown )

if use_python_sdk:
    # Two sample rows keyed by the test model's "dense_input" input name.
    instances = [
        {"dense_input": [14.0, 7.0, 2545.461893666405, 54.2, 48.5, 0.0, 61.3, 0.0, 0.0, 0.0]},
        {"dense_input": [28.0, 14.0, 1234, 27.1, 90, 0.0, 61.3, 0.0, 0.0, 0.0]}
    ]
    # use_dedicated_endpoint=True is the SDK flag for calling a dedicated endpoint.
    response = endpoint.predict(instances=instances, use_dedicated_endpoint=True)
    print(response)

# @markdown You can also make HTTP/GRPC request directly.

use_http = True # @param {type:"boolean"}
# @markdown POST request to the following URL:\
# @markdown `https://ENDPOINT_ID.us-central1-PROJECT_NUMBER.prediction.vertexai.goog/v1/projects/PROJECT_NUMBER/locations/LOCATION/endpoints/ENDPOINT_ID:predict`

if use_http:
    # Build the :predict URL from the endpoint's dedicated DNS name and resource name.
    # In the curl command, {request_path} is interpolated by IPython; the doubled
    # braces are presumably escapes that yield literal JSON braces — confirm before editing.
    request_path = f"https://{endpoint.gca_resource.dedicated_endpoint_dns}/v1/{endpoint.resource_name}:predict"
    ! curl {request_path} -X POST  -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`" -d \
    '{{ \
      "instances": [ \
        {{"dense_input": [14.0, 7.0, 2545.461893666405, 54.2, 48.5, 0.0, 61.3, 0.0, 0.0, 0.0]}}, \
        {{"dense_input": [28.0, 14.0, 1234, 27.1, 90, 0.0, 61.3, 0.0, 0.0, 0.0]}} \
      ] \
    }}'

In [None]:
# @title Raw Predict
# @markdown You can use client library.

use_python_sdk = True # @param {type:"boolean"}
# @markdown response = my_endpoint.raw_predict( \
# @markdown &nbsp;&ensp;&nbsp;&ensp;body = b'{"instances":[{"feat_1":val_1, "feat_2":val_2}]}', \
# @markdown &nbsp;&ensp;&nbsp;&ensp;headers = {'Content-Type':'application/json'}, \
# @markdown &nbsp;&ensp;&nbsp;&ensp;use_dedicated_endpoint=True, \
# @markdown ) \
# @markdown status_code = response.status_code \
# @markdown results = json.dumps(response.text)

if use_python_sdk:
    # Raw JSON request body, passed to the endpoint as bytes.
    body = b'{ \
      "instances": [ \
        {"dense_input": [14.0, 7.0, 2545.461893666405, 54.2, 48.5, 0.0, 61.3, 0.0, 0.0, 0.0]}, \
        {"dense_input": [28.0, 14.0, 1234, 27.1, 90, 0.0, 61.3, 0.0, 0.0, 0.0]} \
      ]\
    }'
    # NOTE(review): `os` is not used anywhere in this cell.
    import os
    # The shell output is captured as a list of lines; the first line is the access token.
    token = !gcloud auth print-access-token
    token = token[0]
    headers = {"content-type": "application/json", "Authorization": f"Bearer {token}"}
    response = endpoint.raw_predict(body=body, headers=headers, use_dedicated_endpoint=True)
    # print(response.status_code)
    print(response.text)

# @markdown You can also make HTTP/GRPC request directly.

use_http = True # @param {type:"boolean"}
# @markdown POST request to the following URL:\
# @markdown `https://ENDPOINT_ID.us-central1-PROJECT_NUMBER.prediction.vertexai.goog/v1/projects/PROJECT_NUMBER/locations/LOCATION/endpoints/ENDPOINT_ID:rawPredict`

if use_http:
    # Build the :rawPredict URL from the endpoint's dedicated DNS name and resource name.
    request_path = f"https://{endpoint.gca_resource.dedicated_endpoint_dns}/v1/{endpoint.resource_name}:rawPredict"
    ! curl {request_path} -X POST  -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`" -d \
    '{{ \
      "instances": [ \
        {{"dense_input": [14.0, 7.0, 2545.461893666405, 54.2, 48.5, 0.0, 61.3, 0.0, 0.0, 0.0]}}, \
        {{"dense_input": [28.0, 14.0, 1234, 27.1, 90, 0.0, 61.3, 0.0, 0.0, 0.0]}} \
      ] \
    }}'

In [None]:
# @title Stream Raw Predict
# @markdown You can use client library.

use_python_sdk = True # @param {type:"boolean"}
# @markdown for stream_response in my_endpoint.stream_raw_predict( \
# @markdown &nbsp;&ensp;&nbsp;&ensp;body = b'{"instances":[{"feat_1":val_1, "feat_2":val_2}]}', \
# @markdown &nbsp;&ensp;&nbsp;&ensp;headers = {'Content-Type':'application/json'}, \
# @markdown &nbsp;&ensp;&nbsp;&ensp;use_dedicated_endpoint=True, \
# @markdown ): \
# @markdown &nbsp;&ensp;&nbsp;&ensp;status_code = response.status_code \
# @markdown &nbsp;&ensp;&nbsp;&ensp;stream_result = json.dumps(response.text)

if use_python_sdk:
    body = b'{ \
      "instances": [ \
        {"dense_input": [14.0, 7.0, 2545.461893666405, 54.2, 48.5, 0.0, 61.3, 0.0, 0.0, 0.0]}, \
        {"dense_input": [28.0, 14.0, 1234, 27.1, 90, 0.0, 61.3, 0.0, 0.0, 0.0]} \
      ]\
    }'
    import os
    token = !gcloud auth print-access-token
    token = token[0]
    headers = {"content-type": "application/json", "Authorization": f"Bearer {token}"}
    for stream_response in endpoint.raw_predict(body=body, headers=headers, use_dedicated_endpoint=True):
        # print(response.status_code)
        print(response.text)

# @markdown You can also make HTTP/GRPC request directly.

use_http = True # @param {type:"boolean"}
# @markdown POST request to the following URL:\
# @markdown `https://ENDPOINT_ID.us-central1-PROJECT_NUMBER.prediction.vertexai.goog/v1/projects/PROJECT_NUMBER/locations/LOCATION/endpoints/ENDPOINT_ID:streamRawPredict`

if use_http:
    request_path = f"https://{endpoint.gca_resource.dedicated_endpoint_dns}/v1/{endpoint.resource_name}:streamRawPredict"
    ! curl {request_path} -X POST  -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`" -d \
    '{{ \
      "instances": [ \
        {{"dense_input": [14.0, 7.0, 2545.461893666405, 54.2, 48.5, 0.0, 61.3, 0.0, 0.0, 0.0]}}, \
        {{"dense_input": [28.0, 14.0, 1234, 27.1, 90, 0.0, 61.3, 0.0, 0.0, 0.0]}} \
      ] \
    }}'

### Chat Completion (Model Garden Only)
**You can use OpenAI client library to do chat completion.**

```
client = openai.OpenAI(base_url=DEDICATED_DNS, api_key=creds.token)
model_response = client.chat.completions.create(
    model="your model",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)
```

**You can also make HTTP/GRPC request directly.**

POST request to the following URL:
```
curl -X POST -H "Content-Type: application/json" \
  -H "Authorization: Bearer `gcloud auth print-access-token`" \
  https://DEDICATED_DNS/v1beta1/projects/PROJECT_NUMBER/locations/LOCATION/endpoints/ENDPOINT_ID/chat/completions -d PAYLOAD
```


## Supported Features

### Traffic split

Deploy another model and update the traffic split to 50:50; this should take less than 5 minutes with the test model. After the deployment is done, rerun the prediction multiple times; you should see different `deployed_model_id` values in the responses.

In [None]:
# Deploy the same model a second time, giving the new deployment 50% of the traffic.
endpoint.deploy(model=model, traffic_percentage=50, machine_type="e2-standard-8")

In [None]:
# Two sample rows keyed by the test model's "dense_input" input name.
instances = [
    {"dense_input": [14.0, 7.0, 2545.461893666405, 54.2, 48.5, 0.0, 61.3, 0.0, 0.0, 0.0]},
    {"dense_input": [28.0, 14.0, 1234, 27.1, 90, 0.0, 61.3, 0.0, 0.0, 0.0]},
]

# Send 1000 prediction requests and tally how many each deployed model served.
counter = {}
for i in range(1000):
    response = endpoint.predict(instances=instances, use_dedicated_endpoint=True)
    served_by = response.deployed_model_id
    counter[served_by] = counter.get(served_by, 0) + 1
print(counter)

You can update the traffic split with the following command and run the code above again.

In [None]:
# Take the first two deployed model IDs recorded in the tally, in the order
# they were first seen.
seen_model_ids = list(counter)
deployed_model_id_0 = seen_model_ids[0]
deployed_model_id_1 = seen_model_ids[1]

# Shift the split from 50:50 to 20:80 between the two deployments.
new_traffic_split = {deployed_model_id_0: 20, deployed_model_id_1: 80}
endpoint.update(traffic_split=new_traffic_split)

### Custom Timeout

A default timeout of 600 seconds (10 minutes) is applied if the inference timeout is 0. The maximum allowed timeout is 1 hour.

Use `EndpointService.UpdateEndpointLongRunning` to change the setting. \
`EndpointService.UpdateEndpoint` doesn't support changing this setting.

```
timeout_endpoint = aiplatform.Endpoint.create(
    display_name="test-dedicated-endpoint-with-timeout",
    dedicated_endpoint_enabled=True,
    inference_timeout=1800, # Unit: Second.
)
```
Alternatively, you can create the endpoint using HTTP.

Create endpoint with timeout sample:
```
curl -X POST -H "Content-Type: application/json" \
  -H "Authorization: Bearer `gcloud auth print-access-token`" \
  https://${DEDICATED_DNS}/v1beta1/projects/${PROJECT_NUMBER}/locations/us-central1/endpoints -d \
  '{ \
      "displayName": "test-dedicated-endpoint-with-timeout", \
      "dedicatedEndpointEnabled": true, \
      "clientConnectionConfig": \
          { \
            "inferenceTimeout": \
              {"seconds": 20, "nanos": 0} \
          } \
    }'
```
Update endpoint long running sample:
```
curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  https://${DEDICATED_DNS}/v1beta1/projects/${PROJECT_ID}/locations/${REGION}/endpoints/${ENDPOINT_ID}:update \
  -d '{ \
      "endpoint": { \
        "clientConnectionConfig": { \
            "inferenceTimeout": {"seconds": 5, "nanos": 0} \
          } \
      } \
    }'
```

### Request&Response Logging

Request/response logging skips payloads that exceed 10 MB, which is the BigQuery limit.

Use `EndpointService.UpdateEndpointLongRunning` to change the setting \
`EndpointService.UpdateEndpoint` doesn't support request/response logging change.

```
logging_endpoint = aiplatform.Endpoint.create(
    display_name="test-dedicated-endpoint-with-logging",
    dedicated_endpoint_enabled=True,
    enable_request_response_logging=True,
    request_response_logging_sampling_rate=1.0, # default 0.0
    request_response_logging_bq_destination_table="bq://test_logging",
    # if not set, a new table will be created with name `bq://{project_id}.logging_{endpoint_display_name}_{endpoint_id}.request_response_logging`
)
```

Alternatively, use HTTP directly.

Create endpoint sample
```
curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json" \
  https://${DEDICATED_DNS}/v1beta1/projects/${PROJECT_ID}/locations/us-central1/endpoints \
  -d '{ \
    "display_name": "test-dedicated-endpoint-with-logging", \
    "dedicatedEndpointEnabled": true, \
    "predict_request_response_logging_config": { \
      "enabled": true, \
      "sampling_rate": 0.5, \
      "bigquery_destination": { \
        "output_uri": "bq://my-project" \
        } \
      } \
    }'
```

Update endpoint long running request sample
```
curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  https://${DEDICATED_DNS}/v1beta1/projects/${PROJECT_ID}/locations/${REGION}/endpoints/${ENDPOINT_ID}:update \
  -d '{ \
      "endpoint": { \
        "predict_request_response_logging_config": { \
          "enabled": true, \
          "sampling_rate": 0.5, \
          "bigquery_destination": { \
            "output_uri": "bq://my-project" \
          } \
        } \
      } \
    }'
```


## Cleanup

In [None]:
# Undeploy every model from the endpoint, delete the endpoint, then delete the uploaded model.
endpoint.undeploy_all()
endpoint.delete()
model.delete()

Delete the bucket if needed.

In [None]:
# Recursively remove the bucket and everything stored in it.
! gsutil rm -r {BUCKET_URI}

Optionally, uncomment the following code to undeploy and delete every endpoint returned by `aiplatform.Endpoint.list()`. This code does not delete models.

In [None]:
# Caution: uncommenting this undeploys and deletes every endpoint returned by
# aiplatform.Endpoint.list(), not only the one created in this notebook.
# for e in aiplatform.Endpoint.list():
#   e.undeploy_all()
#   e.delete()