In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deploying Llama 3 on Google Kubernetes Engine with Cloud Functions and vLLM

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/deploy_llama3_vllm_gke_cloud_function.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fuse-cases%2Fdeploy_llama3_vllm_gke_cloud_function.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/deploy_llama3_vllm_gke_cloud_function.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/deploy_llama3_vllm_gke_cloud_function.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/deploy_llama3_vllm_gke_cloud_function.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/deploy_llama3_vllm_gke_cloud_function.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/deploy_llama3_vllm_gke_cloud_function.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/deploy_llama3_vllm_gke_cloud_function.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [KC Ayyagari](https://github.com/krishchyt) |

This notebook demonstrates how to deploy and serve Llama 3 models on Google Kubernetes Engine (GKE) using GPUs, and how to integrate this deployment with a Cloud Function to create an accessible API endpoint. The model is served with the [vLLM](https://developers.googleblog.com/en/inference-with-gemma-using-dataflow-and-vllm/#:~:text=model%20frameworks%20simple.-,What%20is%20vLLM%3F,-vLLM%20is%20an) (Virtual Large Language Model) inference server.

## Objective

*   Deploy and run inference for serving Llama 3 on GPUs.
*   Create a Cloud Function.
*   Call Llama from the Cloud Function.

![Workflow diagram](https://github.com/krishchyt/kc-demoassets/blob/main/gke-cf-workflow.png?raw=true)

## Key Steps

### Setup

*   Authenticates with Google Cloud.
*   Sets up the Google Cloud project, region, network, and subnet configurations.
*   Enables the necessary Google Cloud services (Kubernetes Engine, Serverless VPC Access).
*   Installs `kubectl`.

### GKE Cluster Creation

*   Creates an Autopilot GKE cluster with specified configurations (region, network, private nodes, etc.).

### Llama 3 Deployment on GKE

*   Defines Kubernetes deployment and service configurations for deploying a selected Llama 3 model on GKE using vLLM.
*   Allows the user to select from a range of Llama 3 models (Llama-3-2-1B-Instruct, Llama-3-2-11B-Vision, etc.).
*   Creates a Kubernetes YAML file based on the selected model and applies it to the cluster.
*   Waits for the container to be created and the server to be up and running.

### Endpoint Testing

*   Retrieves the Internal Load Balancer IP address for the Llama service.
*   Sends a test request to the Llama 3 endpoint and prints the response.

### Cloud Function Integration

*   Creates a service account with the necessary permissions.
*   Prepares the code for a Cloud Function that will act as an API endpoint for the Llama 3 model.
*   Creates a VPC connector to allow the Cloud Function to access the GKE cluster.
*   Deploys the Cloud Function, configuring environment variables (including the Llama endpoint IP) and other settings.

### Cleanup

*   Provides options to delete the deployment, cluster, Cloud Function, and VPC connector to avoid unnecessary charges.

## Use Cases

This setup enables various use cases, including:

*   **Chatbots/Conversational AI:** Building a chatbot that can answer user questions, provide information, or engage in conversations. The Cloud Function acts as the API endpoint, receiving user input and passing it to the Llama 3 model running on GKE for generating responses.
*   **Content Generation:** Generating different kinds of creative text formats, like poems, code, scripts, musical pieces, email, letters, etc. The Cloud Function receives a prompt and parameters (e.g., length, style) and uses Llama 3 to generate the content.
*   **Question Answering:** Providing answers to specific questions based on a knowledge base. The Cloud Function receives the question, and Llama 3 extracts the relevant information and formulates an answer.
*   **Code Generation/Completion:** Assisting developers by generating code snippets or completing code based on context. The Cloud Function receives the code context and uses Llama 3 to suggest code completions or generate entire functions.
*   **Image Captioning and Visual Question Answering (if using a vision-enabled model):** If you're using a Llama 3 model with vision capabilities, you can use this setup for image captioning (generating descriptions of images) or visual question answering (answering questions about the content of an image).
*   **Data Analysis and Insights:** Use Llama 3 to analyze text data and extract insights. The Cloud Function can receive text data, pass it to Llama 3 for analysis, and then return the insights.
*   **Workflow Automation:** Integrate Llama 3 into automated workflows to perform tasks such as summarizing documents, extracting key information, or translating text.


## Benefits

*   **Scalability:** GKE allows you to scale the Llama 3 deployment based on demand.
*   **Cost-Effectiveness:** Cloud Functions are serverless and only charged when used.
*   **Flexibility:** You can easily update the Llama 3 model or the Cloud Function code.

## GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

- Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)

- Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)

In [None]:
%pip install requests>=2.26.0 functions-framework>=3.0.0 google-cloud-logging>=3.0.0

In [None]:
# Run gcloud's Application Default Credentials login flow so that later
# gcloud/kubectl commands in this notebook are authenticated.
! gcloud auth application-default login

## Set up Google Cloud project


In [1]:
# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 3. **[Optional]** Set `CLUSTER_NAME` if you want to use your own GKE cluster. If not set, this example will create a auto-pilot cluster in the specified project.
import datetime

# Get the default region and project for launching jobs.

REGION = "us-central1"  # @param ["us-central1", "us-west1", "us-east4"]
PROJECT_ID = ""  # @param {type:"string"}
assert PROJECT_ID

# Enter sure network is in same region
NETWORK_NAME = ""  # @param {type:"string"}

# Make sure the subnet is in same region
SUBNET_NAME = ""  # @param {type:"string"}
NETWORK_URI = f"projects/{PROJECT_ID}/global/networks/{NETWORK_NAME}"
SUBNET_URI = f"projects/{PROJECT_ID}/regions/{REGION}/subnetworks/{SUBNET_NAME}"

# Set up gcloud.
! gcloud config set project "$PROJECT_ID"
! gcloud services enable container.googleapis.com

# Add kubectl to the set of available tools.
! mkdir -p /tools/google-cloud-sdk/.install
! gcloud components install kubectl --quiet

# The cluster name to create
CLUSTER_NAME = ""  # @param {type:"string"}

now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

CLUSTER_NAME = f"{CLUSTER_NAME}-{now}"

Updated property [core/project].

All components are up to date.


## Set up GKE

### Create Autopilot cluster

In [None]:
# Create the Autopilot cluster (`clusters create-auto`) named CLUSTER_NAME in
# REGION, attached to NETWORK_URI / SUBNET_URI, with private nodes enabled and
# Binary Authorization evaluation disabled.
!gcloud beta container --project {PROJECT_ID} clusters create-auto {CLUSTER_NAME} --region {REGION} --release-channel "regular" --tier "standard" --enable-private-nodes --enable-ip-access --no-enable-google-cloud-access --network {NETWORK_URI} --subnetwork {SUBNET_URI} --cluster-ipv4-cidr "/17" --binauthz-evaluation-mode=DISABLED

### Deploy Llama on GKE

In [None]:
# @markdown This section deploys llama 3.2 on GKE.

# @markdown The model deployment takes about 5 to 15 minutes to complete. Larger models may take longer.

# @markdown Select the model to deploy:
MODEL_NAME = "Llama-3-2-1B-Instruct"  # @param [ 'Llama-3-2-1B-Instruct', 'Llama-3-2-11B-Vision', 'Llama-3-2-11B-Vision-Instruct', 'Llama-3-2-3B', 'Llama-3-2-3B-Instruct', 'Llama-3-2-1B']

# Container `args:` section of the Deployment manifest. The seven `{}` slots
# are filled positionally, in this order:
#   1. model directory under the public Model Garden bucket
#   2. tensor parallel size
#   3. optional extra flag line
#   4. value for --max-num-seqs
#   5.-7. optional extra flag lines
# An optional slot is either a complete "- --flag" YAML list item or "".
ARGS_TEMPLATE = """args:
        - python
        - -m
        - vllm.entrypoints.api_server
        - --host 0.0.0.0
        - --port 7080
        - --model=gs://vertex-model-garden-public-us/llama3.2/{}
        - --tensor-parallel-size {}
        - --swap-space 16
        - --gpu-memory-utilization 0.95
        {}
        - --max-num-seqs {}
        {}
        - --enable-auto-tool-choice
        {}
        - --disable-log-stats
        {}"""


def generate_args(missing_args):
    """Render ARGS_TEMPLATE with the first seven entries of `missing_args`.

    Args:
        missing_args: Sequence holding at least seven strings, one per `{}`
            slot of ARGS_TEMPLATE (in order). Use "" to leave an optional
            slot out.

    Returns:
        The rendered text with every blank / whitespace-only line removed,
        so empty optional slots do not leave gaps in the YAML list.

    Raises:
        IndexError: If fewer than seven values are supplied.
    """
    rendered = ARGS_TEMPLATE.format(*missing_args[:7])
    # `str.strip` returns "" (falsy) for lines that hold only whitespace.
    return "\n".join(filter(str.strip, rendered.splitlines()))


# YAML list item that selects the tool-call parser; used by every profile.
_TOOL_PARSER_FLAG = "- --tool-call-parser=vertex-llama-3"


def _vision_profile(model_dir):
    """Build the attribute list shared by the 11B vision variants."""
    template_values = [
        model_dir,
        "2",
        _TOOL_PARSER_FLAG,
        "12",
        "- --enforce-eager",
        "- --limit_mm_per_prompt='image=1'",
        "- --max-model-len 8192",
    ]
    return [template_values, 15, "58Gi", "120Gi", 2]


def _text_profile(model_dir, cpu, memory, ephemeral_storage):
    """Build the attribute list for a single-GPU text model."""
    template_values = [model_dir, "1", "", "64", "", "", _TOOL_PARSER_FLAG]
    return [template_values, cpu, memory, ephemeral_storage, 1]


# Per-model deployment attributes, keyed by the MODEL_NAME form value.
# Each entry is [ARGS_TEMPLATE values, CPU, memory, ephemeral storage, GPU count].
attr = {
    "Llama-3-2-11B-Vision": _vision_profile("Llama-3.2-11B-Vision"),
    "Llama-3-2-11B-Vision-Instruct": _vision_profile("Llama-3.2-11B-Vision-Instruct"),
    "Llama-3-2-3B": _text_profile("Llama-3.2-3B", 10, "39Gi", "100Gi"),
    "Llama-3-2-3B-Instruct": _text_profile("Llama-3.2-3B-Instruct", 10, "39Gi", "100Gi"),
    "Llama-3-2-1B": _text_profile("Llama-3.2-1B", 8, "29Gi", "80Gi"),
    "Llama-3-2-1B-Instruct": _text_profile("Llama-3.2-1B-Instruct", 8, "29Gi", "80Gi"),
}

# Look up the selected model and split its attribute list into the values
# substituted into the Kubernetes manifest.
model_attr = attr[MODEL_NAME]
(
    _template_values,
    CPU_LIMITS,
    MEMORY_SIZE,
    EPHEMERAL_STORAGE_SIZE,
    GPU_COUNT,
) = model_attr
ARGS = generate_args(_template_values)


# Two-document manifest: a single-replica Deployment running the vLLM serving
# image with the resource values and container args selected above, and a
# LoadBalancer Service annotated as "Internal" that forwards port 8000 to the
# container's port 7080.
K8S_YAML = f"""
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server
  template:
    metadata:
      labels:
        app: llama-server
        ai.gke.io/model: {MODEL_NAME}
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: model-garden
    spec:
      containers:
      - name: inference-server
        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241007_2233_RC00
        resources:
          requests:
            cpu: {CPU_LIMITS}
            memory: {MEMORY_SIZE}
            ephemeral-storage: {EPHEMERAL_STORAGE_SIZE}
            nvidia.com/gpu: {GPU_COUNT}
          limits:
            cpu: {CPU_LIMITS}
            memory: {MEMORY_SIZE}
            ephemeral-storage: {EPHEMERAL_STORAGE_SIZE}
            nvidia.com/gpu: {GPU_COUNT}
        {ARGS}
        env:
        - name: MODEL_ID
          value: 'meta-llama/{MODEL_NAME}'
        - name: DEPLOY_SOURCE
          value: 'UI_NATIVE_MODEL'
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
apiVersion: v1
kind: Service
metadata:
  name: llama-service
  annotations:
    cloud.google.com/load-balancer-type: "Internal"
spec:
  selector:
    app: llama-server # Should match the labels on your Pods
  type: LoadBalancer
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 7080
"""

# Persist the rendered manifest in the working directory so kubectl can read it.
with open("llama_32.yaml", "w") as f:
    f.write(K8S_YAML)

# Create (or update) the Deployment and Service on the current cluster.
! kubectl apply -f llama_32.yaml

### Wait for container to be created.


In [None]:
import time

MAX_WAIT_TIME = 600  # 10 minutes in seconds
start_time = time.time()
end_time = time.time() + MAX_WAIT_TIME

print("Waiting for container to be created...\n")
while start_time < end_time:
    shell_output = ! kubectl get pod -l app=llama-server
    container_status = "\n".join(shell_output)
    if "1/1" in container_status:
        break
    time.sleep(15)
    start_time += 15

if start_time > end_time:
    print("Deployment took longer than expected")

print(container_status)

# Wait for downloading artifacts.
start_time = time.time()
end_time = time.time() + MAX_WAIT_TIME
print("\nDownloading artifacts...")
while start_time < end_time:
    shell_output = ! kubectl logs -l app=llama-server
    logs = "\n".join(shell_output)
    if "Connected" in logs or "Uvicorn running" in logs:
        break
    time.sleep(15)
    start_time += 15

if start_time > end_time:
    print("Deployment took longer than expected")


print("\nServer is up and running!")

### Testing endpoint

In [None]:
import json
import subprocess
import sys
import time

import requests  # Required: pip install requests

# --- Configuration ---
# Kubernetes Service to query and the HTTP route exposed through it.
SERVICE_NAME = "llama-service"
NAMESPACE = "default"  # Adjust if your service is in a different namespace
SERVICE_PORT = 8000  # The 'port' specified in your Service YAML
API_PATH = "/generate"  # The specific API path for vLLM generation

# --- Request Payload ---
# Form-controlled generation parameters sent with the test request.
user_message = "What is AI?"  # @param {type: "string"}
max_tokens = 50  # @param {type:"integer"}
temperature = 0.9  # @param {type:"number"}

# JSON body for the POST request.
request_payload = dict(
    prompt=user_message,
    max_tokens=max_tokens,
    temperature=temperature,
)


# --- Function to Get Internal Load Balancer IP ---
def get_internal_lb_ip(service_name, namespace, retries=5, delay=10):
    """Poll kubectl for the load balancer ingress IP of a Kubernetes Service.

    Args:
        service_name: Name of the Service to inspect.
        namespace: Namespace that contains the Service.
        retries: Maximum number of kubectl invocations.
        delay: Seconds to sleep between two attempts.

    Returns:
        The IP address as a string, or None when kubectl is not installed or
        no address was reported within `retries` attempts.
    """
    kubectl_cmd = ["kubectl", "get", "service", service_name]
    kubectl_cmd += ["-n", namespace]
    kubectl_cmd += ["-o", "jsonpath={.status.loadBalancer.ingress[0].ip}"]

    print(
        f"Attempting to get IP for service '{service_name}' in namespace '{namespace}'..."
    )

    attempt_no = 0
    while attempt_no < retries:
        attempt_no += 1
        progress = f"Attempt {attempt_no}/{retries}: "
        try:
            completed = subprocess.run(
                kubectl_cmd, capture_output=True, text=True, check=True, timeout=30
            )
        except subprocess.CalledProcessError as err:
            # The service may not exist yet, or the JSON path may fail temporarily.
            print(
                f"{progress}Error getting service IP: {err}. stderr: {err.stderr}. Waiting {delay} seconds..."
            )
        except subprocess.TimeoutExpired:
            print(
                f"{progress}'kubectl' command timed out. Waiting {delay} seconds..."
            )
        except FileNotFoundError:
            # Retrying cannot help when the binary itself is missing.
            print(
                "Error: 'kubectl' command not found. Please ensure kubectl is installed and in your PATH."
            )
            return None
        else:
            found_ip = completed.stdout.strip()
            if found_ip:
                print(f"Successfully found Internal LB IP: {found_ip}")
                return found_ip
            print(
                f"{progress}IP address not assigned yet. Waiting {delay} seconds..."
            )

        # No sleep after the final attempt.
        if attempt_no < retries:
            time.sleep(delay)

    print(
        f"Error: Could not retrieve Internal LB IP for service '{service_name}' after {retries} attempts."
    )
    return None


# --- Main Execution ---
# Resolve the Service's load balancer IP, send one test request to the
# endpoint and print the generated text, or a diagnostic message on failure.
internal_ip = get_internal_lb_ip(SERVICE_NAME, NAMESPACE)

if internal_ip:
    # Construct the full endpoint URL
    endpoint_url = f"http://{internal_ip}:{SERVICE_PORT}{API_PATH}"
    print(f"Calling endpoint: {endpoint_url}")

    headers = {"Content-Type": "application/json"}

    try:
        # Make the POST request
        response = requests.post(
            endpoint_url,
            headers=headers,
            json=request_payload,  # requests library handles json serialization
            timeout=120,  # Set a reasonable timeout (in seconds)
        )

        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()

        # Parse the JSON response
        response_data = response.json()

        # Extract and print the prediction (adjust path if vLLM format differs)
        # Assuming the structure is similar to the output you got via kubectl exec
        if (
            "predictions" in response_data
            and isinstance(response_data["predictions"], list)
            and len(response_data["predictions"]) > 0
        ):
            print("\n--- Response ---")
            print(response_data["predictions"][0])
        elif (
            "text" in response_data
            and isinstance(response_data["text"], list)
            and len(response_data["text"]) > 0
        ):
            # Handle potential alternative vLLM output format like {'text': ['...']}
            print("\n--- Response ---")
            print(response_data["text"][0])
        else:
            print(
                "\n--- Full Response (Could not find 'predictions' or 'text' array) ---"
            )
            print(json.dumps(response_data, indent=2))

    except requests.exceptions.ConnectionError as e:
        print(
            f"\nError: Could not connect to the endpoint {endpoint_url}.",
            file=sys.stderr,
        )
        print(
            "Check network connectivity and if the service/pods are running.",
            file=sys.stderr,
        )
        print(f"   Details: {e}", file=sys.stderr)
    except requests.exceptions.Timeout:
        print(
            f"\nError: Request timed out while calling {endpoint_url}.", file=sys.stderr
        )
    except json.JSONDecodeError:
        # Must precede the RequestException handler: in recent requests
        # releases the error raised by response.json() is also a
        # RequestException, so the generic handler would otherwise shadow
        # this one and the raw response would never be shown.
        print(
            "\nError: Could not decode the JSON response from the server.",
            file=sys.stderr,
        )
        print("--- Raw Response ---", file=sys.stderr)
        print(response.text, file=sys.stderr)
    except requests.exceptions.RequestException as e:
        print(
            f"\nError: An error occurred during the request to {endpoint_url}: {e}",
            file=sys.stderr,
        )
        # Print response body if available, might contain useful error info from the server
        if e.response is not None:
            print("--- Server Response ---", file=sys.stderr)
            try:
                print(json.dumps(e.response.json(), indent=2), file=sys.stderr)
            except json.JSONDecodeError:
                print(e.response.text, file=sys.stderr)
else:
    print(
        "\nExecution failed: Could not determine the endpoint IP address.",
        file=sys.stderr,
    )

## Cloud Function

### Create service account and assign right permissions

In [None]:
# Short name (the part before "@") of the service account to create.
SERVICE_ACCOUNT_NAME = ""  # @param {type: "string"}

!gcloud iam service-accounts create {SERVICE_ACCOUNT_NAME} \
  --project={PROJECT_ID} \
  --description="llama service account" \
  --display-name="llama testing service account"

# Full email form of the account, used as the IAM member below and as the
# Cloud Function's runtime identity at deploy time.
SERVICE_ACCOUNT = f"{SERVICE_ACCOUNT_NAME}@{PROJECT_ID}.iam.gserviceaccount.com"

# Grant each project-level role to the service account, one binding per role.
for role in ['aiplatform.user', 'storage.objectAdmin', 'artifactregistry.reader', 'run.developer', 'run.invoker']:

    ! gcloud projects add-iam-policy-binding {PROJECT_ID} \
      --member=serviceAccount:{SERVICE_ACCOUNT} \
      --role=roles/{role} --condition=None

### Prepare code for Cloud function

In [None]:
from pathlib import Path as path

# Working directories for the Cloud Function sources, rooted at the current
# working directory: <cwd>/tutorial/build.
ROOT_PATH = path.cwd()
TUTORIAL_PATH = ROOT_PATH.joinpath("tutorial")
BUILD_PATH = TUTORIAL_PATH.joinpath("build")

# Create both directories; directories that already exist are left untouched.
for _directory in (TUTORIAL_PATH, BUILD_PATH):
    _directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Contents of the Cloud Function's requirements.txt: one requirement per
# line, with a leading and a trailing newline.
requirements = "\n".join(
    [
        "",
        "requests>=2.26.0",
        "functions-framework>=3.0.0",
        "google-cloud-logging>=3.0.0",
        "",
    ]
)

# Write the file next to main.py in the build directory.
(BUILD_PATH / "requirements.txt").write_text(requirements)

In [None]:
# Source of the Cloud Function's main.py, kept as a string so it can be written
# into the build directory below. The function forwards a JSON prompt to the
# Llama endpoint whose IP is supplied through the LLAMA_ENDPOINT_IP variable.
cloud_function_code = '''
import os
import json
import requests
import logging
import functions_framework # Required by GCF

# Configure logging
logging.basicConfig(level=logging.INFO)

# --- Configuration (Partially from Environment) ---
# Fetch required info from environment variables set during deployment
LLAMA_ILB_IP = os.environ.get("LLAMA_ENDPOINT_IP") # Expecting the IP here
SERVICE_PORT = os.environ.get("LLAMA_ENDPOINT_PORT", "8000") # Default to 8000 if not set
API_PATH = "/generate" # The specific API path for vLLM generation
REQUEST_TIMEOUT = 120 # Timeout for the request to the Llama service

@functions_framework.http # Defines this as an HTTP-triggered function
def call_llama_service(request):
    """
    Google Cloud Function entry point.
    Expects a POST request with JSON body containing:
    {
        "prompt": "Your question here",
        "max_tokens": 50,      // optional
        "temperature": 0.9     // optional
    }
    """
    if not LLAMA_ILB_IP:
        logging.error("Environment variable LLAMA_ENDPOINT_IP is not set.")
        return ("Internal Server Error: Service endpoint IP not configured.", 500)

    if request.method != 'POST':
        logging.warning(f"Received non-POST request method: {request.method}")
        return ('Method Not Allowed', 405)

    try:
        request_json = request.get_json(silent=True)
        if not request_json or 'prompt' not in request_json:
            logging.error("Invalid request: Missing JSON payload or 'prompt' key.")
            return ("Invalid request: Missing JSON payload or 'prompt' key.", 400)

        # --- Get Parameters from Request ---
        prompt = request_json['prompt']
        max_tokens = request_json.get('max_tokens', 50) # Default if not provided
        temperature = request_json.get('temperature', 0.9) # Default if not provided

        request_payload = {
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }

        # Construct the full endpoint URL
        endpoint_url = f"http://{LLAMA_ILB_IP}:{SERVICE_PORT}{API_PATH}"
        logging.info(f"Calling Llama endpoint: {endpoint_url}")
        # Log payload without potentially sensitive prompt details in production if needed
        logging.info(f"Payload (excluding prompt): max_tokens={max_tokens}, temperature={temperature}")

        headers = {"Content-Type": "application/json"}

        # --- Make the POST request ---
        response = requests.post(
            endpoint_url,
            headers=headers,
            json=request_payload,
            timeout=REQUEST_TIMEOUT
        )

        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()

        # Parse the JSON response from Llama service
        response_data = response.json()
        logging.info("Successfully received response from Llama service.")

        # --- Process and Return Response ---
        # Extract prediction (adjust keys if needed based on actual vLLM response)
        prediction = None
        if "predictions" in response_data and isinstance(response_data["predictions"], list) and len(response_data["predictions"]) > 0:
             prediction = response_data["predictions"][0]
        elif "text" in response_data and isinstance(response_data["text"], list) and len(response_data["text"]) > 0:
             prediction = response_data["text"][0]
        else:
             logging.warning("Could not find 'predictions' or 'text' in response. Returning full response.")
             # Return the full response if the expected key isn't found
             return (response_data, 200) # Return raw JSON with 200 OK

        if prediction is not None:
            # Return only the prediction text/data
            return ({ "prediction": prediction }, 200) # Return prediction in a structured way
        else:
            logging.error("Prediction key found but content was empty/invalid.")
            return ("Error processing model response", 500)


    except requests.exceptions.ConnectionError as e:
        logging.error(f"Connection Error calling {endpoint_url}: {e}")
        return (f"Could not connect to the backend service: {e}", 503) # 503 Service Unavailable
    except requests.exceptions.Timeout:
        logging.error(f"Request timed out calling {endpoint_url}")
        return ("Backend service timed out.", 504) # 504 Gateway Timeout
    except json.JSONDecodeError:
        # Must precede the RequestException handler: in recent requests releases
        # the error raised by response.json() is also a RequestException, so the
        # generic handler would otherwise shadow this one.
        logging.error(f"Could not decode the JSON response from the Llama service. Raw response: {response.text}")
        return ("Invalid response format from backend service.", 502)
    except requests.exceptions.RequestException as e:
        logging.error(f"RequestException calling {endpoint_url}: {e}")
        # Log response body if available
        error_details = str(e)
        if e.response is not None:
             logging.error(f"Backend service response status: {e.response.status_code}")
             try:
                  error_details = json.dumps(e.response.json())
                  logging.error(f"Backend service response body: {error_details}")
             except json.JSONDecodeError:
                  error_details = e.response.text
                  logging.error(f"Backend service response body (non-JSON): {error_details}")
        return (f"Error calling backend service: {error_details}", 502) # 502 Bad Gateway
    except Exception as e:
        logging.exception(f"An unexpected error occurred: {e}") # Log full traceback
        return ("An internal server error occurred.", 500)
'''


# Write the function source next to requirements.txt in the build directory.
with open(BUILD_PATH / "main.py", "w") as f:
    f.write(cloud_function_code)

### Create VPC connector

In [None]:
# Select the project again, then enable the API used by the VPC connector
# created in the next cell.
! gcloud config set project {PROJECT_ID}
! gcloud services enable vpcaccess.googleapis.com

In [None]:
# Name of the VPC access connector to create and the IP range (/28 CIDR)
# passed to it via --range.
VPC_CONNECTOR_NAME = "" # @param {type: "string"}
RANGE = "10.6.0.0/28" # @param {type: "string"}

# Create the connector in REGION on NETWORK_NAME, scaling between 2 and 5
# e2-micro instances.
!gcloud compute networks vpc-access connectors create {VPC_CONNECTOR_NAME} \
 --region {REGION} \
 --network {NETWORK_NAME} \
 --range {RANGE} \
 --min-instances 2 \
 --max-instances 5 \
 --machine-type "e2-micro"

### Create function

In [None]:
# --- Set these variables ---
FUNCTION_NAME="test-llama-gke-1" # @param {type: "string"}
ILB_IP= internal_ip # The IP address you found above

# --- Deployment Command ---
!gcloud functions deploy {FUNCTION_NAME} \
    --gen2 \
    --runtime="python311" \
    --source={str(BUILD_PATH)} \
    --region={REGION} \
    --entry-point=call_llama_service \
    --trigger-http \
    --no-allow-unauthenticated \
    --set-env-vars=LLAMA_ENDPOINT_IP={ILB_IP} \
    --quiet \
    --service-account={SERVICE_ACCOUNT} \
    --timeout=600 \
    --memory=2Gb \
    --concurrency=2 \
    --min-instances=2 \
    --project {PROJECT_ID} \
    --vpc-connector {VPC_CONNECTOR_NAME}


print(f"Function: {FUNCTION_NAME} deployed successfully.")

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.
DELETE_DEPLOYMENT = False  # @param {type: "boolean"}
DELETE_CLUSTER = False  # @param {type: "boolean"}

if DELETE_DEPLOYMENT:
    ! kubectl delete deployments llama-deployment
    ! kubectl delete services llama-service

if DELETE_CLUSTER:
    ! gcloud container clusters delete {CLUSTER_NAME} --region={REGION} --quiet

! gcloud functions delete {FUNCTION_NAME} --region={REGION} --quiet

! gcloud compute networks vpc-access connectors delete {VPC_CONNECTOR_NAME} --region {REGION} --quiet