In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Mistral models deployment to GKE using GPU

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_tgi_mistral_deployment_on_gke.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_tgi_mistral_deployment_on_gke.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates downloading and deploying Mistral, open models from Mistral AI, using Text Generation Inference ([TGI](https://github.com/huggingface/text-generation-inference)), an efficient serving option that improves serving throughput. In this notebook we deploy and serve TGI on GPUs. This guide specifically uses L4 GPUs, but it should also work for A100 (40 GB), A100 (80 GB), and H100 (80 GB) GPUs.


### Objective

Deploy and run inference for serving Mistral with TGI on GPUs.

### GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)

Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)


### TGI

TGI is a highly optimized open-source LLM serving framework that can increase serving throughput on GPUs. TGI includes features such as:

- Optimized transformer implementation with PagedAttention
- Continuous batching to improve the overall serving throughput
- Tensor parallelism and distributed serving on multiple GPUs

To learn more, refer to the [TGI documentation](https://github.com/huggingface/text-generation-inference/blob/main/README.md)

## Run the notebook

In [None]:
# @title Setup Google Cloud project


# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. Set Hugging Face access token in `HF_TOKEN` field. If you don't already have a "read" access token, follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create an access token with "read" permission. You can find your existing access tokens in the Hugging Face [Access Token](https://huggingface.co/settings/tokens) page.

# @markdown 3. **[Optional]** Set `CLUSTER_NAME` if you want to use your own GKE cluster. If not set, this example will create a standard cluster with 2 NVIDIA L4 GPU accelerators.

# @markdown 3. **[Optional]** Set `PROJECT_ID` if you have a specific GCP project you want to use.
import datetime
import os

# The HuggingFace token used to download models.
HF_TOKEN = ""  # @param {type:"string"}
assert HF_TOKEN, "Set Hugging Face access token in `HF_TOKEN`."

# The cluster name to create
CLUSTER_NAME = ""  # @param {type:"string"}

# Get the default cloud project id.
PROJECT_ID = ""  # @param {type:"string"}
if not PROJECT_ID:
    PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = "us-central1"  # @param ["us-central1", "us-west1", "us-east4"]

# Set up gcloud.
! gcloud config set project "$PROJECT_ID"
! gcloud services enable container.googleapis.com

# Add kubectl to the set of available tools.
! mkdir -p /tools/google-cloud-sdk/.install
! gcloud components install kubectl --quiet

# Use existing GKE cluster or create a new cluster.
if CLUSTER_NAME:
  ! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}
else:
  now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  CLUSTER_NAME = f"gke-cluster-{now}"
  # create auto-pilot cluster
  !gcloud container clusters create-auto {CLUSTER_NAME} --location={REGION} --project={PROJECT_ID}

# Create Kubernetes secret for Hugging Face credentials
! kubectl create secret generic hf-secret \
    --from-literal=hf_api_token={HF_TOKEN} \
    --dry-run=client -o yaml > hf-secret.yaml

! kubectl apply -f hf-secret.yaml

In [None]:
# @title Deploy

# @markdown This section deploys Mistral models on GKE.

# @markdown The model deployment takes about 5 to 15 minutes to complete.

# @markdown Select the model to deploy:
MODEL_NAME = "Mistral-7B-v0.1"  # @param ['Mistral-7B-v0.1','Mistral-7B-Instruct-v0.2']


# Kubernetes manifest with two documents:
#   1. A single-replica Deployment running the TGI container. It requests one
#      GPU, is pinned to L4 nodes through the nodeSelector, reads the Hugging
#      Face token from the `hf-secret` secret, and mounts an in-memory volume
#      at /dev/shm.
#   2. A ClusterIP Service that forwards port 8000 to container port 80.
# MODEL_NAME is interpolated into the model label, the --model-id argument and
# the MODEL_ID environment variable.
MISTRAL_YAML = f"""
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-server
  template:
    metadata:
      labels:
        app: mistral-server
        ai.gke.io/model: {MODEL_NAME}
        ai.gke.io/inference-server: text-generation-inference
        examples.ai.gke.io/source: model-garden
    spec:
      containers:
      - name: inference-server
        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-1.ubuntu2204.py310
        resources:
          requests:
            cpu: 8
            memory: 29Gi
            ephemeral-storage: 80Gi
            nvidia.com/gpu : 1
          limits:
            cpu: 8
            memory: 29Gi
            ephemeral-storage: 80Gi
            nvidia.com/gpu : 1
        command:
        args:
        - --model-id=mistralai/{MODEL_NAME}
        - --cuda-memory-fraction=0.9
        env:
        - name: DEPLOY_SOURCE
          value: UI_HF_VERIFIED_MODEL
        - name: MAX_INPUT_LENGTH
          value: "512"
        - name: MAX_TOTAL_TOKENS
          value: "1024"
        - name: MAX_BATCH_PREFILL_TOKENS
          value: "2048"
        - name: TRUST_REMOTE_CODE
          value: "true"
        - name: MODEL_ID
          value: "mistralai/{MODEL_NAME}"
        - name: NUM_SHARD
          value: "1"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        - name: "AIP_HTTP_PORT"
          value: "80"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4

---
apiVersion: v1
kind: Service
metadata:
  name: mistral-service
spec:
  selector:
    app: mistral-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 80
"""

# Write the rendered manifest to the working directory and apply it to the
# cluster that kubectl is currently configured for.
with open("mistral_tgi.yaml", "w") as f:
    f.write(MISTRAL_YAML)
! kubectl apply -f mistral_tgi.yaml

import subprocess
import time


def wait_for_status(
    command, expected_output, timeout_seconds=600, poll_interval=15, message=""
):
    """Polls a command until its output contains an expected string.

    Args:
        command: Argument list passed to `subprocess.check_output`.
        expected_output: Substring to look for in the command's stdout.
        timeout_seconds: Maximum time to keep polling, in seconds.
        poll_interval: Seconds to wait between attempts.
        message: Progress text printed once before polling starts.

    Returns:
        True if `expected_output` appeared before the deadline, False if the
        deadline passed first.
    """
    # A monotonic clock is unaffected by wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout_seconds
    last_error = ""

    print(f"{message}...")
    while time.monotonic() < deadline:
        try:
            output = subprocess.check_output(
                command, text=True, stderr=subprocess.PIPE
            )
            if expected_output in output:
                print("Done!\n")
                return True
            last_error = ""
        except subprocess.CalledProcessError as err:
            # A non-zero exit is expected while the resource is still coming
            # up; remember the error text so a timeout can explain itself.
            last_error = (err.stderr or "").strip()
        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining > 0:
            time.sleep(min(poll_interval, remaining))

    print("Timeout!\n")
    if last_error:
        print(f"Last error from {command[0]}: {last_error}\n")
    return False


# Wait until the pod reports its container as ready ("1/1" in the READY column).
wait_for_status(
    ["kubectl", "get", "pod", "-l", "app=mistral-server"],
    "1/1",
    message="Waiting for container to be created",
)

# When a label selector is used, `kubectl logs` returns only the last 10 lines
# by default, so the "Connected" line could scroll out of view between polls.
# `--tail=-1` requests the full log so the marker is found once it was printed.
wait_for_status(
    ["kubectl", "logs", "-l", "app=mistral-server", "--tail=-1"],
    "Connected",
    message="Downloading artifacts (checking for 'Connected')",
)

In [None]:
# @title Chat completion for text-only models

# @markdown Once the server is up and running, you may send prompts to local server for prediction.

import json

prompt = "What is AI?"  # @param {type: "string"}
temperature = 0.40  # @param {type: "number"}
top_p = 0.1  # @param {type: "number"}
max_tokens = 250  # @param {type: "number"}

request = {
    "inputs": prompt,
    "temperature": temperature,
    "top_p": top_p,
    "max_tokens": max_tokens,
}

get_pod = ! kubectl get pod -l app=mistral-server -o jsonpath="{{.items[0].metadata.name}}"
pod_name = get_pod[0]

exec_command = f"""kubectl exec -t {pod_name} -- curl -X POST http://localhost:80/generate \
   -H "Content-Type: application/json" \
   -d '{json.dumps(request)}' \
   2> /dev/null"""

response = !{exec_command}
# @markdown Response:
print(json.loads(response[0])["generated_text"])

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.
DELETE_DEPLOYMENT = False # @param {type: "boolean"}
DELETE_CLUSTER = False # @param {type: "boolean"}

if DELETE_CLUSTER or DELETE_DEPLOYMENT:
  ! kubectl delete deployment mistral-deployment
  ! kubectl delete service mistral-service

if DELETE_CLUSTER:
  ! gcloud container clusters delete {CLUSTER_NAME} \
    --region={REGION} \
    --quiet