In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemma deployment to GKE using TGI on GPU

## Overview

This notebook demonstrates downloading and deploying Gemma, a family of open models from Google DeepMind, using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference), an efficient serving option that improves serving throughput. In this notebook we will deploy and serve TGI on GPUs. This guide specifically uses L4 GPUs, but it should also work for A100 (40 GB), A100 (80 GB), and H100 (80 GB) GPUs.


### Objective

Deploy and run inference for serving Gemma with TGI on GPUs.

### GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

- Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)
- Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)


### TGI

TGI is a highly optimized open-source LLM serving framework that can increase serving throughput on GPUs. TGI includes features such as:

- Optimized transformer implementation with PagedAttention
- Continuous batching to improve the overall serving throughput
- Tensor parallelism and distributed serving on multiple GPUs

To learn more, refer to the [TGI documentation](https://github.com/huggingface/text-generation-inference/blob/main/README.md)


## Before you begin

### Configure Environment

Set the following variables for the experiment environment.

In [22]:
# Hugging Face access token used to download the model weights.
HF_TOKEN = ""  # @param {type:"string"}

# Which Gemma variant to launch.
MODEL_SIZE = "2b"  # @param ["2b", "7b"]

# Google Cloud project ID.
PROJECT_ID = ""  # @param {type:"string"}

# Region in which the cluster is launched.
REGION = ""  # @param {type:"string"}

# Name of the cluster to create.
CLUSTER_NAME = "gke-gemma-cluster"  # @param {type:"string"}

# Resource profile derived from MODEL_SIZE. "7b" selects the larger profile;
# every other value falls back to the "2b" profile.
#   GPU_COUNT               number of GPUs to run (1 for 2b, 2 for 7b)
#   EPHEMERAL_STORAGE_SIZE  ephemeral storage for the serving container
#   MEMORY_SIZE             memory for the serving container
#   GPU_SHARD               number of shards passed to TGI (--num-shard)
#   CPU_LIMITS              CPU limit for the serving container
(
    GPU_COUNT,
    EPHEMERAL_STORAGE_SIZE,
    MEMORY_SIZE,
    GPU_SHARD,
    CPU_LIMITS,
) = (2, "40Gi", "25Gi", 2, 10) if MODEL_SIZE == "7b" else (1, "20Gi", "7Gi", 1, 2)

In [None]:
# Point gcloud at the project chosen in the configuration cell. PROJECT_ID is a
# Python variable; IPython substitutes it for $PROJECT_ID before the shell runs.
! gcloud config set project "$PROJECT_ID"
# Enable the container.googleapis.com service (needed for the GKE commands below).
! gcloud services enable container.googleapis.com

# Add kubectl to the set of available tools.
# NOTE(review): the directory is created first, presumably because the gcloud
# component installer expects it on this runtime — confirm.
! mkdir -p /tools/google-cloud-sdk/.install
! gcloud components install kubectl --quiet

### Create a GKE cluster and a node pool

GKE creates the following resources for the model, based on the `MODEL_SIZE` variable set in the configuration cell above.

- Standard cluster
- A GPU node pool with one `g2-standard-24` node that has 2 NVIDIA L4 GPU accelerators. The deployment requests 1 GPU for Gemma 2b or 2 GPUs for Gemma 7b.

If you already have a cluster, you can skip to `Use an existing GKE cluster` instead.

In [None]:
# Create the GKE cluster CLUSTER_NAME in REGION, with a workload pool derived
# from PROJECT_ID (--workload-pool) and enrolled in the rapid release channel.
# NOTE(review): for a regional cluster --num-nodes is presumably applied per
# zone, so the total node count may be a multiple of 4 — confirm this is intended.
! gcloud container clusters create {CLUSTER_NAME} \
  --project={PROJECT_ID} \
  --region={REGION} \
  --workload-pool={PROJECT_ID}.svc.id.goog \
  --release-channel=rapid \
  --num-nodes=4

In [None]:
# Add the GPU node pool "gpupool" to the cluster: one g2-standard-24 node with
# 2 NVIDIA L4 GPUs and the latest GPU driver, placed in zone "<REGION>-a".
# The accelerator count is fixed at 2 here regardless of MODEL_SIZE.
# NOTE(review): the "-a" zone suffix is hard-coded — verify that this zone
# exists and offers L4 GPUs in the chosen REGION.
! gcloud container node-pools create gpupool \
  --accelerator type=nvidia-l4,count=2,gpu-driver-version=latest \
  --project={PROJECT_ID} \
  --location={REGION} \
  --node-locations={REGION}-a \
  --cluster={CLUSTER_NAME} \
  --machine-type=g2-standard-24 \
  --num-nodes=1

### Use an existing GKE cluster

In [None]:
# Fetch credentials for CLUSTER_NAME in REGION so that the kubectl commands in
# the following cells talk to this cluster.
! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}

### Create Kubernetes secret for Hugging Face credentials

Create a Kubernetes Secret that contains the Hugging Face token.

In [None]:
! kubectl create secret generic hf-secret \
    --from-literal=hf_api_token={HF_TOKEN} \
    --dry-run=client -o yaml > hf-secret.yaml

! kubectl apply -f hf-secret.yaml

### Deploy TGI

Use the following YAML manifest to deploy Gemma on TGI.

In [None]:
# Kubernetes manifest for serving Gemma with TGI, rendered from the variables
# set in the configuration cell (MODEL_SIZE, MEMORY_SIZE,
# EPHEMERAL_STORAGE_SIZE, GPU_COUNT, CPU_LIMITS, GPU_SHARD). It defines:
#   * a single-replica Deployment whose "inference-server" container runs the
#     TGI image on a node selected by the nvidia-l4 accelerator label and reads
#     the Hugging Face token from the "hf-secret" Secret;
#   * a ClusterIP Service "llm-service" that exposes the server on port 8000.
# "$(MODEL_ID)" in args contains no braces, so the f-string leaves it untouched;
# it refers to the MODEL_ID entry defined under env.
K8S_YAML = f"""
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: gemma-{MODEL_SIZE}
        ai.gke.io/inference-server: text-generation-inference
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - name: inference-server
        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01
        resources:
          requests:
            cpu: "2"
            memory: {MEMORY_SIZE}
            ephemeral-storage: {EPHEMERAL_STORAGE_SIZE}
            nvidia.com/gpu: {GPU_COUNT}
          limits:
            cpu: {CPU_LIMITS}
            memory: {MEMORY_SIZE}
            ephemeral-storage: {EPHEMERAL_STORAGE_SIZE}
            nvidia.com/gpu: {GPU_COUNT}
        args:
        - --model-id=$(MODEL_ID)
        - --num-shard={GPU_SHARD}
        env:
        - name: MODEL_ID
          value: google/gemma-{MODEL_SIZE}-it
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
"""

# Write the rendered manifest to tgi.yaml in the working directory ...
with open("tgi.yaml", "w") as f:
    f.write(K8S_YAML)

# ... and apply it to the cluster.
! kubectl apply -f tgi.yaml

#### Wait for the container to be created

Use the command below to check on the status of the container.

In [None]:
# List the pods in the current namespace with their status; re-run this cell to
# refresh the status.
! kubectl get pod

#### View the logs from the running deployment

The server first downloads the needed artifacts. This process takes close to 5 minutes, depending on the runtime you are using for your Colab environment. The server is up and running and ready to take inference requests once you see log messages like these:

```
INFO text_generation_router: router/src/main.rs:237: Using the Hugging Face API to retrieve tokenizer config
INFO text_generation_router: router/src/main.rs:280: Warming up model
INFO text_generation_router: router/src/main.rs:316: Setting max batch total tokens to 666672
INFO text_generation_router: router/src/main.rs:317: Connected
```

In [None]:
# Stream (-f) the logs of the pods labelled app=gemma-server. The stream keeps
# following until the cell is interrupted, so stop it manually once the server
# reports that it is connected.
! kubectl logs -f -l app=gemma-server

#### Send a test prompt (runs `curl` inside the pod, so no port forwarding is needed)

In [None]:
! kubectl exec -t $( kubectl get pod -l app=gemma-server -o jsonpath="{.items[0].metadata.name}" ) -c inference-server -- curl -X POST http://localhost:8000/generate \
   -H "Content-Type: application/json" \
   -d '{ "inputs": "What are the top 5 most popular programming languages? Please be brief.", "temperature": 0.40, "top_p": 0.1, "max_tokens": 250 }' \
   2> /dev/null

## Clean up resources

In [None]:
# Remove the Kubernetes objects created by this notebook: the TGI deployment,
# its "llm-service" Service, and the Hugging Face token Secret.
! kubectl delete deployments tgi-gemma-deployment
! kubectl delete services llm-service
! kubectl delete secrets hf-secret

In [None]:
# Delete the GKE cluster CLUSTER_NAME in REGION; --quiet skips the interactive
# confirmation prompt.
! gcloud container clusters delete {CLUSTER_NAME} \
  --region={REGION} \
  --quiet