In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving Gemma 3 with Ollama on Cloud Run

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fserving%2Fcloud_run_ollama_gemma3_inference.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_gemma3_inference.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

<img src="https://ollama.com/public/ollama.png" height="200px" alignment="center"/>
<img src="https://cloud.google.com/static/architecture/images/ac-page-icons/card_google_cloud_partner.svg" height="200px">


| | |
|-|-|
| Author(s) | [Vlad Kolesnikov](https://github.com/vladkol) |

## Overview

> [**Gemma 3**](https://ai.google.dev/gemma) is a new generation of open models developed by Google. It is a collection of lightweight, state-of-the-art open models built from the same research and technology that powers our Gemini 2.0 models. Gemma 3 comes in a range of sizes (1B, 4B, 12B and 27B), allowing you to choose the best model for your specific hardware and performance needs. Gemma 3 models are available through platforms like Google AI Studio, Kaggle, and Hugging Face.

> **[Cloud Run](https://cloud.google.com/run)** is a serverless platform by Google Cloud for running containerized applications. It automatically scales and manages infrastructure, supporting various programming languages. Cloud Run now offers GPU acceleration for AI/ML workloads. With 30 seconds to the first token, Cloud Run is a perfect platform for serving lightweight models like Gemma.

> **Note:** GPU support in Cloud Run is in preview. To use the GPU feature, you must request `Total Nvidia L4 GPU allocation, per project per region` quota under Cloud Run in the [Quotas and system limits page](https://cloud.google.com/run/quotas#increase).


> **[Ollama](https://ollama.com)** is an open-source tool for easily running and deploying large language models locally. It offers simple management and usage of LLMs on personal computers or servers.

This notebook showcases how to deploy [Google Gemma 3](https://developers.googleblog.com/en/introducing-gemma3) on Cloud Run, with the goal of building a simple API for chat or RAG applications.

By the end of this notebook, you will learn how to:

1. Deploy Google Gemma 3 as an OpenAI-compatible API on Cloud Run using Ollama.
2. Build a custom container with Ollama to deploy any Large Language Model (LLM) of your choice.
3. Make requests to an API hosted on Cloud Run.

## Get started

### Install Google Cloud SDK

Make sure the Google Cloud SDK is installed (try running `gcloud version`) or [install it](https://cloud.google.com/sdk/docs/install) before executing this notebook.

> If you are running in Colab or Vertex AI Workbench, the Google Cloud SDK is already installed.

### Choose a model, a project, and a region to host the model

Choose a Gemma 3 model to use, a Google Cloud project to host your Cloud Run service, and a region to host it in.

If you don't have a project yet:

1. [Create a project](https://console.cloud.google.com/projectcreate) in the Google Cloud Console.
2. Copy your `Project ID` from the project's [Settings page](https://console.cloud.google.com/iam-admin/settings).

The project must have `Total Nvidia L4 GPU allocation, per project per region` quota allocated in the selected region.
To make sure it's available, check Cloud Run in the [Quotas and system limits page](https://console.cloud.google.com/iam-admin/quotas).

In [None]:
# { display-mode: "form", run: "auto" }

# Ollama model tag that is baked into the container image and served.
MODEL = "gemma3:4b"  # @param {type:"string", isTemplate: true}

# Google Cloud project and region used for the build and the Cloud Run service.
PROJECT_ID = "[your-project-id]"  # @param {type:"string", isTemplate: true}
REGION = "us-central1"  # @param {type:"string", isTemplate: true}

# Stop here while PROJECT_ID is empty or still the placeholder value,
# so that later cells are not run against a non-existent project.
if PROJECT_ID == "[your-project-id]" or not PROJECT_ID:
    print("Please specify your project id in PROJECT_ID variable.")
    raise KeyboardInterrupt

# Derive the Cloud Run service name from the model tag by replacing
# ".", ":" and "/" with "-".
# NOTE(review): no other characters are replaced and case is left unchanged;
# confirm that the chosen model tag yields a valid Cloud Run service name.
MODEL_NAME_ESCAPED = MODEL.translate(str.maketrans(".:/", "---"))
SERVICE_NAME = f"ollama--{MODEL_NAME_ESCAPED}"

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Run the cell below.

In [None]:
# Log in only when no identity token can be obtained for the current account.
# The token is discarded with the POSIX redirect form so that the check also
# works when the command is run by a shell other than bash.
!gcloud auth print-identity-token -q > /dev/null 2>&1 || gcloud auth login --project="{PROJECT_ID}" --update-adc --quiet

## Prepare container image

First, let's create a Dockerfile for a container with the model embedded into it.

In [None]:
%%writefile Dockerfile

# Ollama server image with the model weights downloaded at build time.
FROM ollama/ollama:0.6.0

# Model tag to embed, supplied with --build-arg at build time
ARG MODEL

# Set the model name (kept in the environment so the startup command can read it)
ENV MODEL=$MODEL

# Set the host and port to listen on
ENV OLLAMA_HOST=0.0.0.0:8080

# Set the directory to store model weight files
ENV OLLAMA_MODELS=/models

# Reduce the verbosity of the logs
ENV OLLAMA_DEBUG=false

# Do not unload model weights from the GPU
ENV OLLAMA_KEEP_ALIVE=-1

# Start the ollama server and download the model weights
# (the server runs in the background; the short sleep gives it time to start)
RUN ollama serve & sleep 5 && ollama pull $MODEL

# At startup time we start the server and run a dummy request
# to request the model to be loaded in the GPU memory
ENTRYPOINT ["/bin/sh"]
CMD ["-c", "ollama serve  & (ollama run $MODEL 'Say one word' &) && wait"]

Second, we create a Cloud Build file to use for building and pushing our container image.

In [None]:
%%writefile cloudbuild.yaml

# Cloud Build configuration: builds the image from the local Dockerfile and
# pushes it to the location given by the _IMAGE substitution.
# _REGION, _AR_REPO_NAME, _SERVICE_NAME and _MODEL are expected to be passed
# in when the build is submitted.
steps:
  - id: build
    name: "gcr.io/cloud-builders/docker"
    entrypoint: "bash"
    args:
      - "-c"
      - |
        docker buildx build --tag=${_IMAGE} --build-arg MODEL=${_MODEL} .

# Images listed here are pushed after the steps complete.
images:
  - "${_IMAGE}"

substitutions:
  _IMAGE: "${_REGION}-docker.pkg.dev/${PROJECT_ID}/${_AR_REPO_NAME}/${_SERVICE_NAME}"

options:
  # Needed so that _IMAGE can be composed from other substitutions.
  dynamicSubstitutions: true
  machineType: "E2_HIGHCPU_32"

## Build Container Image and Deploy Cloud Run Service

We are ready to build our container image and deploy Cloud Run service.

The script below performs the following actions:

* Enables necessary APIs.
* Creates an Artifact Repository for the image.
* Creates a Service Account for the service.
* Submits a Cloud Build job to create and push the container image.
* Deploys the Cloud Run service.

> The script may take 10-45 minutes to finish.

Note the following important flags in the Cloud Run deployment command:

* `--concurrency 4` is set to match the value of the environment variable `OLLAMA_NUM_PARALLEL`.
* `--gpu 1` with `--gpu-type nvidia-l4` assigns 1 NVIDIA L4 GPU to every Cloud Run instance in the service.
* `--no-allow-unauthenticated` restricts unauthenticated access to the service.
By keeping the service private, you can rely on Cloud Run's built-in [Identity and Access Management (IAM)](https://cloud.google.com/iam) authentication for service-to-service communication.
* `--no-cpu-throttling` is required for enabling GPU.
* `--service-account` sets the service identity of the service.
* `--max-instances` sets maximum number of instances of the service.
It has to be equal to or lower than your project's NVIDIA L4 GPU (`Total Nvidia L4 GPU allocation, per project per region`) quota.

For optimal GPU utilization, increase `--concurrency`, keeping it within twice the value of `OLLAMA_NUM_PARALLEL`.
While this leads to request queuing in Ollama, it can help improve utilization:
Ollama instances can immediately process requests from their queue, and the queues help absorb traffic spikes.

In [None]:
%%writefile deploy.sh
#!/bin/bash
#
# Builds the Ollama container image with Cloud Build and deploys it as a
# GPU-backed Cloud Run service.
#
# Usage: deploy.sh PROJECT_ID REGION MODEL_ID SERVICE_NAME
#   PROJECT_ID    Google Cloud project that hosts the build and the service.
#   REGION        Region of the Artifact Registry repository and the service.
#   MODEL_ID      Ollama model tag passed to the image build.
#   SERVICE_NAME  Cloud Run service name; also used as the image name.

if [ "$#" -lt 4 ]; then
    echo "Usage: $0 PROJECT_ID REGION MODEL_ID SERVICE_NAME" >&2
    exit 1
fi

PROJECT_ID="$1"
REGION="$2"
MODEL_ID="$3"
SERVICE_NAME="$4"
AR_REPO_NAME="ollama-repo"
SERVICE_ACCOUNT="ollama-cloud-run-sa"
SERVICE_ACCOUNT_ADDRESS="${SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com"
MAX_INSTANCES=1 # Adjust this value to match your Cloud Run L4 GPU quota ("Total Nvidia L4 GPU allocation, per project per region", NvidiaL4GpuAllocPerProjectRegion, run.googleapis.com/nvidia_l4_gpu_allocation)

# API enablement runs before `set -e`, so a failure here does not abort the script.
echo "Enabling APIs in project ${PROJECT_ID}."
gcloud services enable run.googleapis.com \
    cloudbuild.googleapis.com \
    artifactregistry.googleapis.com \
    --project "${PROJECT_ID}" \
    --quiet

# From here on, stop at the first failing command.
set -e

# Creating the service account if doesn't exist.
sa_list=$(gcloud iam service-accounts list --quiet --format 'value(email)' --project "${PROJECT_ID}" --filter="email:${SERVICE_ACCOUNT_ADDRESS}" 2>/dev/null)
if [ -z "${sa_list}" ]; then
    echo "Creating Service Account ${SERVICE_ACCOUNT}."
    gcloud iam service-accounts create "${SERVICE_ACCOUNT}" \
        --project "${PROJECT_ID}" \
        --display-name="${SERVICE_ACCOUNT} - Cloud Run Service Account"
fi

# Creating the Artifacts Repository if doesn't exist
repo_list=$(gcloud artifacts repositories list --format 'value(name)' --filter=name="projects/${PROJECT_ID}/locations/${REGION}/repositories/${AR_REPO_NAME}" --project "${PROJECT_ID}" --quiet --location "${REGION}" 2>/dev/null)
if [ -z "${repo_list}" ]; then
    echo "Creating Artifact Registry ${AR_REPO_NAME}."
    gcloud artifacts repositories create "${AR_REPO_NAME}" \
        --repository-format docker \
        --location "${REGION}" \
        --project="${PROJECT_ID}"
fi

echo "Building container image."
gcloud builds submit --config=cloudbuild.yaml --project="${PROJECT_ID}" . \
    --suppress-logs \
    --substitutions \
    "_AR_REPO_NAME=${AR_REPO_NAME},_REGION=${REGION},_SERVICE_NAME=${SERVICE_NAME},_MODEL=${MODEL_ID}"
# The build inputs are no longer needed once the image has been built.
rm -f cloudbuild.yaml
rm -f Dockerfile

echo "Deploying Service ${SERVICE_NAME}."
# --concurrency is kept equal to OLLAMA_NUM_PARALLEL.
gcloud beta run deploy "${SERVICE_NAME}" \
    --project="${PROJECT_ID}" \
    --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/${AR_REPO_NAME}/${SERVICE_NAME}" \
    --service-account "${SERVICE_ACCOUNT_ADDRESS}" \
    --cpu=8 \
    --memory=32Gi \
    --gpu=1 --gpu-type=nvidia-l4 \
    --concurrency 4 \
    --set-env-vars OLLAMA_NUM_PARALLEL=4 \
    --region "${REGION}" \
    --no-allow-unauthenticated \
    --max-instances "${MAX_INSTANCES}" \
    --no-cpu-throttling \
    --timeout 1h

SERVICE_URL=$(gcloud run services describe "${SERVICE_NAME}" --project="${PROJECT_ID}" --region "${REGION}" --format 'value(status.url)' --quiet)
echo "✅ Success!"
echo "🚀 Service URL: ${SERVICE_URL}"

In [None]:
# Run the deployment script with the settings chosen above.
# The script file is deleted only when the script exits successfully.
!/bin/bash ./deploy.sh "{PROJECT_ID}" "{REGION}" "{MODEL}" "{SERVICE_NAME}" && rm -f ./deploy.sh

## Test the deployed service

Now, let's test the service you deployed.

First, simply by using `cURL`.

In [None]:
%%bash -s $MODEL $SERVICE_NAME $PROJECT_ID $REGION

# Positional arguments are the notebook variables listed on the magic line above.
MODEL_ID="${1}"
SERVICE_NAME="${2}"
PROJECT_ID="${3}"
REGION="${4}"

PROMPT="Hello!"
SERVICE_URL=$(gcloud run services describe "${SERVICE_NAME}" --project "${PROJECT_ID}" --region "${REGION}" --format 'value(status.url)' --quiet)
# The service is private, so every request carries an identity token.
AUTH_TOKEN=$(gcloud auth print-identity-token -q)

# Ollama's native generate endpoint takes the output length cap as
# options.num_predict. The shell variables are expanded inside double quotes
# so that values containing spaces stay part of the single JSON argument.
curl -s -X POST "${SERVICE_URL}/api/generate" \
-H "Authorization: Bearer ${AUTH_TOKEN}" \
-H "Content-Type: application/json" \
-d '{ "model": "'"${MODEL_ID}"'", "prompt": "'"${PROMPT}"'", "options": {"num_predict": 100}, "stream": false}'

### Ollama Python Library

You can also use the Ollama Python library to make requests to the service you deployed.

In [None]:
# Install the Ollama Python library into the kernel's environment
# (-q keeps pip's output quiet).
%pip install ollama -q

In [None]:
import subprocess

from ollama import Client


def _gcloud_output(*args: str) -> str:
    """Run `gcloud` with the given arguments and return its stripped stdout.

    The arguments are passed as a list rather than through a shell, so the
    notebook variables interpolated into them are never parsed as shell syntax.

    Raises:
        subprocess.CalledProcessError: if gcloud exits with a non-zero status.
    """
    return subprocess.check_output(["gcloud", *args], text=True).strip()


# Identity token of the active gcloud account; sent as a bearer token
# with every request to the service.
identity_token = _gcloud_output("auth", "print-identity-token", "-q")

# URL of the deployed Cloud Run service.
service_url = _gcloud_output(
    "run",
    "services",
    "describe",
    SERVICE_NAME,
    f"--project={PROJECT_ID}",
    f"--region={REGION}",
    "--format=value(status.url)",
    "-q",
)

client = Client(host=service_url, headers={"Authorization": f"Bearer {identity_token}"})

# Stream the chat response and print each chunk as it arrives.
stream = client.chat(
    model=MODEL,
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    stream=True,
)
for chunk in stream:
    print(chunk["message"]["content"], end="", flush=True)

## Conclusion
Congratulations! 💎 Now you know how to deploy Gemma 3 with Ollama to Cloud Run powered by a GPU!

## Cleaning up

To delete the Cloud Run service you created, you can uncomment and run the following cell.

In [None]:
# !gcloud run services delete $SERVICE_NAME --project $PROJECT_ID --region $REGION --quiet