In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Llama 3.2 deployment to GKE using GPU

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_llama3_2_deployment_on_gke.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_llama3_2_deployment_on_gke.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview
This notebook demonstrates downloading, deploying, and serving prebuilt Llama 3.2 models on GPUs using GKE. The models are served with the Virtual Large Language Model ([vLLM](https://developers.googleblog.com/en/inference-with-gemma-using-dataflow-and-vllm/#:~:text=model%20frameworks%20simple.-,What%20is%20vLLM%3F,-vLLM%20is%20an)) inference server.



### Objective

Deploy Llama 3.2 models on GKE GPU nodes and run inference against the served model.

### GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

- Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)

- Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)

## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 3. **[Optional]** Set `CLUSTER_NAME` if you want to use your own GKE cluster. If not set, this example will create a auto-pilot cluster in the specified project.
import datetime
import os

# Get the default cloud project id.
default_project = os.environ["GOOGLE_CLOUD_PROJECT"]
PROJECT_ID = default_project  # @param {type:"string"}
assert PROJECT_ID

# Get the default region for launching jobs.

REGION = "us-central1"  # @param ["us-central1", "us-west1", "us-east4"]

# Set up gcloud.
! gcloud config set project "$PROJECT_ID"
! gcloud services enable container.googleapis.com

# Add kubectl to the set of available tools.
! mkdir -p /tools/google-cloud-sdk/.install
! gcloud components install kubectl --quiet

# The cluster name to create
CLUSTER_NAME = ""  # @param {type:"string"}

# Use existing GKE cluster or create a new cluster.
if CLUSTER_NAME:
    ! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}
else:
    now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    CLUSTER_NAME = f"gke-cluster-{now}"
    # create auto-pilot cluster
    !gcloud container clusters create-auto {CLUSTER_NAME} --location={REGION} --project={PROJECT_ID}

In [None]:
# @title Deploy

# @markdown This section deploys llama 3.2 on GKE.

# @markdown The model deployment takes about 5 to 15 minutes to complete. Larger models may take longer.

# @markdown Select the model to deploy:
MODEL_NAME = "Llama-3-2-11B-Vision"  # @param ['Llama-Guard-3-1B', 'Llama-Guard-3-11B-Vision', 'Llama-3-2-11B-Vision', 'Llama-3-2-11B-Vision-Instruct', 'Llama-3-2-3B', 'Llama-3-2-3B-Instruct', 'Llama-3-2-1B', 'Llama-3-2-1B-Instruct']
ARGS_TEMPLATE = """args:
        - python
        - -m
        - vllm.entrypoints.api_server
        - --host 0.0.0.0
        - --port 7080
        - --model=gs://vertex-model-garden-public-us/llama3.2/{}
        - --tensor-parallel-size {}
        - --swap-space 16
        - --gpu-memory-utilization 0.95
        {}
        - --max-num-seqs {}
        {}
        - --enable-auto-tool-choice
        {}
        - --disable-log-stats
        {}"""
# The seven `{}` slots above are filled in this order: model folder name,
# tensor-parallel size, an optional flag line, max-num-seqs, then three more
# optional flag lines. Example values for a fully populated template:
# Model_name, tensor_size, - --model-type=llama3.1, max_seqs, - --enforce-eager, - --limit_mm_per_prompt='image=1', - --max-model-len 8192


def generate_args(missing_args):
    """Fill ARGS_TEMPLATE and drop the lines left blank by empty slots.

    Args:
        missing_args: Sequence holding at least seven strings, substituted
            positionally into the `{}` placeholders of ARGS_TEMPLATE. An
            empty string leaves its placeholder line blank.

    Returns:
        The rendered `args:` block as a single newline-joined string with
        every whitespace-only line removed.
    """
    rendered = ARGS_TEMPLATE.format(*missing_args)
    return "\n".join(row for row in rendered.splitlines() if row.strip())


# Per-model deployment settings, keyed by the MODEL_NAME dropdown value.
# Each entry is a 5-item list:
#   [0] seven strings substituted, in order, into the `{}` slots of
#       ARGS_TEMPLATE by generate_args (an empty string leaves that slot's
#       line out of the rendered args),
#   [1] CPU count            -> CPU_LIMITS
#   [2] memory quantity      -> MEMORY_SIZE
#   [3] ephemeral storage    -> EPHEMERAL_STORAGE_SIZE
#   [4] GPU count            -> GPU_COUNT
# Note that the dictionary keys use dashes ("Llama-3-2-...") while the model
# folder name in slot [0][0] uses a dot ("Llama-3.2-...") for the non-Guard
# models.
attr = {
    "Llama-Guard-3-1B": [
        [
            "Llama-Guard-3-1B",
            "1",
            "",
            "12",
            "",
            "",
            "- --tool-call-parser=vertex-llama-3",
        ],
        8,
        "29Gi",
        "80Gi",
        1,
    ],
    "Llama-Guard-3-11B-Vision": [
        [
            "Llama-Guard-3-11B-Vision",
            "2",
            "- --model-type=llama3.1",
            "12",
            "- --enforce-eager",
            "- --limit_mm_per_prompt='image=1'",
            "- --max-model-len 8192",
        ],
        15,
        "58Gi",
        "120Gi",
        2,
    ],
    "Llama-3-2-11B-Vision": [
        [
            "Llama-3.2-11B-Vision",
            "2",
            "- --tool-call-parser=vertex-llama-3",
            "12",
            "- --enforce-eager",
            "- --limit_mm_per_prompt='image=1'",
            "- --max-model-len 8192",
        ],
        15,
        "58Gi",
        "120Gi",
        2,
    ],
    "Llama-3-2-11B-Vision-Instruct": [
        [
            "Llama-3.2-11B-Vision-Instruct",
            "2",
            "- --tool-call-parser=vertex-llama-3",
            "12",
            "- --enforce-eager",
            "- --limit_mm_per_prompt='image=1'",
            "- --max-model-len 8192",
        ],
        15,
        "58Gi",
        "120Gi",
        2,
    ],
    "Llama-3-2-3B": [
        [
            "Llama-3.2-3B",
            "1",
            "",
            "64",
            "",
            "",
            "- --tool-call-parser=vertex-llama-3",
        ],
        10,
        "39Gi",
        "100Gi",
        1,
    ],
    "Llama-3-2-3B-Instruct": [
        [
            "Llama-3.2-3B-Instruct",
            "1",
            "",
            "64",
            "",
            "",
            "- --tool-call-parser=vertex-llama-3",
        ],
        10,
        "39Gi",
        "100Gi",
        1,
    ],
    "Llama-3-2-1B": [
        [
            "Llama-3.2-1B",
            "1",
            "",
            "64",
            "",
            "",
            "- --tool-call-parser=vertex-llama-3",
        ],
        8,
        "29Gi",
        "80Gi",
        1,
    ],
    "Llama-3-2-1B-Instruct": [
        [
            "Llama-3.2-1B-Instruct",
            "1",
            "",
            "64",
            "",
            "",
            "- --tool-call-parser=vertex-llama-3",
        ],
        8,
        "29Gi",
        "80Gi",
        1,
    ],
}

# Look up the settings for the selected model and split them into the
# resource values used by the manifest and the rendered container args.
model_attr = attr[MODEL_NAME]
CPU_LIMITS, MEMORY_SIZE, EPHEMERAL_STORAGE_SIZE, GPU_COUNT = model_attr[1:5]
ARGS = generate_args(model_attr[0])


# Kubernetes manifest for the model server, rendered from the per-model
# settings above. It defines:
#   * a single-replica Deployment `llama-deployment` whose container has
#     identical resource requests and limits, receives the rendered `args:`
#     block via {ARGS}, mounts a memory-backed emptyDir at /dev/shm, and is
#     scheduled onto nodes labelled with the nvidia-l4 accelerator;
#   * a ClusterIP Service `llama-service` that forwards port 8000 to
#     container port 7080 (the port passed to the server in ARGS_TEMPLATE).
# {ARGS} sits at an 8-space indent, matching the indent that ARGS_TEMPLATE
# gives every line after its first, so the block lines up under the container.
K8S_YAML = f"""
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server
  template:
    metadata:
      labels:
        app: llama-server
        ai.gke.io/model: {MODEL_NAME}
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: model-garden
    spec:
      containers:
      - name: inference-server
        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241007_2233_RC00
        resources:
          requests:
            cpu: {CPU_LIMITS}
            memory: {MEMORY_SIZE}
            ephemeral-storage: {EPHEMERAL_STORAGE_SIZE}
            nvidia.com/gpu: {GPU_COUNT}
          limits:
            cpu: {CPU_LIMITS}
            memory: {MEMORY_SIZE}
            ephemeral-storage: {EPHEMERAL_STORAGE_SIZE}
            nvidia.com/gpu: {GPU_COUNT}
        {ARGS}
        env:
        - name: MODEL_ID
          value: 'meta-llama/{MODEL_NAME}'
        - name: DEPLOY_SOURCE
          value: 'UI_NATIVE_MODEL'
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
apiVersion: v1
kind: Service
metadata:
  name: llama-service
spec:
  selector:
    app: llama-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 7080
"""

# Write the manifest to the working directory so kubectl can read it.
with open("llama_32.yaml", "w") as f:
    f.write(K8S_YAML)

# Create or update the Deployment and Service in the current kubectl context.
! kubectl apply -f llama_32.yaml

# Wait for container to be created.
import time

MAX_WAIT_TIME = 600  # 10 minutes in seconds
start_time = time.time()
end_time = time.time() + MAX_WAIT_TIME

print("Waiting for container to be created...\n")
while start_time < end_time:
    shell_output = ! kubectl get pod -l app=llama-server
    container_status = "\n".join(shell_output)
    if "1/1" in container_status:
        break
    time.sleep(15)
    start_time += 15

if start_time > end_time:
    print("Deployment took longer than expected")

print(container_status)

# Wait for downloading artifacts.
start_time = time.time()
end_time = time.time() + MAX_WAIT_TIME
print("\nDownloading artifacts...")
while start_time < end_time:
    shell_output = ! kubectl logs -l app=llama-server
    logs = "\n".join(shell_output)
    if "Connected" in logs or "Uvicorn running" in logs:
        break
    time.sleep(15)
    start_time += 15

if start_time > end_time:
    print("Deployment took longer than expected")


print("\nServer is up and running!")

In [None]:
# @title Chat completion for text-only models

# @markdown Once the server is up and running, you may send prompts to local server for prediction.

import json

user_message = "What is AI?"  # @param {type: "string"}
max_tokens = 50  # @param {type:"integer"}
temperature = 0.9  # @param {type:"number"}

# Overrides max_tokens and top_k parameters during inferences.
request = {
    "prompt": user_message,
    "max_tokens": max_tokens,
    "temperature": temperature,
}
get_pod = ! kubectl get pod -l app=llama-server -o jsonpath="{{.items[0].metadata.name}}"
pod_name = get_pod[0]

exec_command = f"""kubectl exec -t {pod_name} -- curl -X POST http://localhost:7080/generate \
   -H "Content-Type: application/json" \
   -d '{json.dumps(request)}' \
   2> /dev/null"""

response = !{exec_command}
# print(response)
# @markdown Response:
print(json.loads(response[0])["predictions"][0])

In [None]:
# @title Chat completion for vision models

# @markdown Next fill out some request parameters:

user_image = "https://upload.wikimedia.org/wikipedia/commons/thumb/c/cb/The_Blue_Marble_%28remastered%29.jpg/580px-The_Blue_Marble_%28remastered%29.jpg"  # @param {type: "string"}
user_message = "What is in the image?"  # @param {type: "string"}
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}

# @markdown Now we can send a request.

prompt = f"""image_url: {user_image} user_prompt:{user_message}"""

# Overrides max_tokens and top_k parameters during inferences.
request = {
    "prompt": prompt,
    "max_tokens": max_tokens,
    "temperature": temperature,
}

get_pod = ! kubectl get pod -l app=llama-server -o jsonpath="{{.items[0].metadata.name}}"
pod_name = get_pod[0]

exec_command = f"""kubectl exec -t {pod_name} -- curl -X POST http://localhost:7080/generate \
   -H "Content-Type: application/json" \
   -d '{json.dumps(request)}' \
   2> /dev/null"""

response = !{exec_command}
print(json.loads(response[0])["predictions"][0])

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.
DELETE_DEPLOYMENT = False # @param {type: "boolean"}
DELETE_CLUSTER = False # @param {type: "boolean"}

if DELETE_CLUSTER or DELETE_DEPLOYMENT:
  ! kubectl delete deployments llama-deployment
  ! kubectl delete services llama-service

if DELETE_CLUSTER:
  ! gcloud container clusters delete {CLUSTER_NAME} \
    --region={REGION} \
    --quiet