In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Grok-1 Model (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_grok1_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_grok1_deployment.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview
This notebook deploys the Grok-1 model on Vertex AI. Grok-1 is a 314-billion-parameter Mixture-of-Experts model trained by xAI.

### Objective

- Deploy Grok-1 on GPU with Vertex AI

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. This notebook uses 8 `NVIDIA_H100_80GB` or `NVIDIA_A100_80GB` GPUs with an `a3-highgpu-8g` or `a2-ultragpu-8g` machine respectively. [Make a shared reservation](https://cloud.google.com/compute/docs/instances/reservations-shared) for the required amount of GPUs in the desired machine type.

# @markdown 3. Use your A2 and A3 VM reservations with Vertex Model Garden. To access this capability, please contact your account team.

import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform  # Get the default cloud project id.

PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

! gcloud config set project $PROJECT_ID

In [None]:
# @markdown ### Access Grok-1 Models

# URI of the pre-built serving Docker image for Grok-1. It is passed as the
# serving container image when the model is uploaded in `deploy_grok1_model`.
GROK1_SERVING_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/jax-grok-serve-gpu:20240404_1814_RC00"


def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local date and time to a prefix.

    Args:
        prefix: Leading part of the job name.

    Returns:
        `prefix` followed by a `_YYYYMMDD_HHMMSS` suffix taken from
        `datetime.now()`.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return "".join((prefix, timestamp_suffix))


def deploy_grok1_model(
    model_name: str,
    machine_type: str = "a3-highgpu-8g",
    accelerator_type: str = "NVIDIA_H100_80GB",
    accelerator_count: int = 8,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads the Grok-1 serving container as a model and deploys it to a new endpoint.

    The steps run in this order: create an endpoint named
    `<model_name>-endpoint`, upload a model backed by
    `GROK1_SERVING_DOCKER_URI`, then deploy that model to the endpoint.

    Args:
        model_name: Display name of the uploaded model; also used as the
            prefix of the endpoint display name.
        machine_type: Machine type passed to the deployment.
        accelerator_type: Accelerator type passed to the deployment.
        accelerator_count: Number of accelerators passed to the deployment.

    Returns:
        A `(model, endpoint)` tuple with the uploaded model and the endpoint
        it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Settings for the serving container: routes, port, environment and limits.
    container_settings = dict(
        serving_container_image_uri=GROK1_SERVING_DOCKER_URI,
        serving_container_ports=[8080],
        serving_container_predict_route="/predict",
        serving_container_health_route="/health",
        serving_container_environment_variables={"MODEL_ID": "grok-1-314b"},
        serving_container_shared_memory_size_mb=(70 * 1024),  # 70 GB
        # NOTE(review): presumably seconds, per the Vertex AI SDK — confirm.
        serving_container_deployment_timeout=7200,
    )
    model = aiplatform.Model.upload(display_name=model_name, **container_settings)

    # Compute resources requested for the deployed model.
    compute_settings = dict(
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )
    model.deploy(
        endpoint=endpoint,
        deploy_request_timeout=3600,
        **compute_settings,
    )
    return model, endpoint

In [None]:
# @title Deploy

# @markdown This section creates a Grok-1 model in Model Registry and deploys it to a Vertex AI Endpoint. It takes about 30 minutes to an hour to finish depending on the model and the accelerator.

# Find Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

accelerator_type = "NVIDIA_A100_80GB"  # @param ["NVIDIA_H100_80GB", "NVIDIA_A100_80GB"]

# Each supported accelerator maps to (machine type, accelerator count).
machine_specs = {
    "NVIDIA_H100_80GB": ("a3-highgpu-8g", 8),
    "NVIDIA_A100_80GB": ("a2-ultragpu-8g", 8),
}
if accelerator_type not in machine_specs:
    raise ValueError(f"Unsupported accelerator type: {accelerator_type}")
machine_type, accelerator_count = machine_specs[accelerator_type]

# Note for the prediction step: a larger setting of `max_tokens` increases the
# risk of running out of GPU memory with long prompts.

# Upload the model and deploy it under a timestamped display name.
model, endpoint = deploy_grok1_model(
    model_name=get_job_name_with_datetime(prefix="grok1-serve"),
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)

Once deployment succeeds, you can send requests to the endpoint with text prompts.

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the endpoint name of
#   the endpoint `endpoint` created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

prompt = "Instruction: Separate answers with new paragraphs.\nQ1:Write me a song about solar eclipse.\nA1:"  # @param {type: "string"}
max_tokens = 100  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}

# One request instance carrying the prompt and the two generation settings.
request_payload = {
    "prompt": prompt,
    "max_tokens": max_tokens,
    "temperature": temperature,
}
instances = [request_payload]
response = endpoint.predict(instances=instances)

# Print the "response" field of every returned prediction.
for prediction in response.predictions:
    generated_text = prediction.get("response")
    print(generated_text)

In [None]:
# @title Clean up resources
# @markdown  Delete the experiment models and endpoints to release the resources
# @markdown  and avoid unnecessary ongoing charges.

endpoint.delete(force=True)

# Delete models.
model.delete()

# Delete Cloud Storage objects.
delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI