In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Intro to Context Caching with the Gemini API

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/context-caching/intro_context_caching.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fcontext-caching%2Fintro_context_caching.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/context-caching/intro_context_caching.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/context-caching/intro_context_caching.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://goo.gle/4jeQyWo">
      <img width="32px" src="https://cdn.qwiklabs.com/assets/gcp_cloud-e3a77215f0b8bfa9b3f611c0d2208c7e8708ed31.svg" alt="Google Cloud logo"><br> Open in  Cloud Skills Boost
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/context-caching/intro_context_caching.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/context-caching/intro_context_caching.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/context-caching/intro_context_caching.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/context-caching/intro_context_caching.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/context-caching/intro_context_caching.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| Authors |
| --- |
| [Eric Dong](https://github.com/gericdong) |
| [Holt Skinner](https://github.com/holtskinner) |

## Overview

The Gemini API provides the context caching feature to store frequently used input tokens in a dedicated cache and use them for subsequent requests, eliminating the need to repeatedly pass the same set of tokens to a model. This feature can help reduce the number of tokens sent to the model, thereby lowering the cost of requests.

The Gemini API offers two different caching mechanisms:

- Implicit caching (automatic, no cost saving guarantee)
- Explicit caching (manual, cost saving guarantee)

For more information, refer to the [context caching overview](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-overview) page.

### Objectives

In this tutorial, you learn how to use implicit caching and explicit caching with the Google Gen AI SDK in Vertex AI.

You will complete the following tasks:

- Use implicit caching
- Use explicit caching
  - Create a context cache
  - Retrieve and use a context cache
  - Use context caching in Chat
  - Update the expiration time of a context cache
  - Delete a context cache

## Get started

### Install Google Gen AI SDK for Python

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --upgrade --quiet google-genai

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Interactive authentication is only attempted when the google.colab module
# is already loaded in this interpreter, i.e. when running on Colab.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
LOCATION = "us-central1"  # @param {type:"string"}

# Fall back to the environment when the form still holds the placeholder.
# str() keeps PROJECT_ID a string even when the variable is unset.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

# GOOGLE_CLOUD_REGION wins when it is set; otherwise keep the region chosen in
# the form above instead of unconditionally resetting it to "us-central1".
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION") or LOCATION or "us-central1"

## Code Examples

### Import libraries

In [None]:
from IPython.display import Markdown, display
from google import genai
from google.genai.types import (
    Content,
    CreateCachedContentConfig,
    GenerateContentConfig,
    Part,
    UpdateCachedContentConfig,
)

### Create a client

In [None]:
# Single Gen AI SDK client, pointed at Vertex AI for the configured project and region.
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
)

### Use a supported model

See context caching [supported models](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-overview#supported_models).

In [None]:
# Dated preview model IDs are retired on a schedule, so default to the
# generally available Gemini 2.5 Flash model ID.
MODEL_ID = "gemini-2.5-flash"  # @param ["gemini-2.0-flash-001", "gemini-2.5-flash", "gemini-2.5-pro"] {"allow-input":true, isTemplate: true}

## Implicit caching

Implicit caching directly passes cache cost savings to developers without the need to create an explicit cache. When you send a request to one of the Gemini 2.5 models, the request is eligible for a cache hit if it shares a common prefix with a previous request.

**Note** that implicit caching is enabled by default for all Gemini 2.0 and 2.5 models but cost savings only apply to Gemini 2.5 models. The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro.

### Re-enable caching

By default, Google foundation models cache inputs for Gemini models. If you disabled caching and want to re-enable it, run the following curl command. To run this command, a user must be granted the Vertex AI administrator role `roles/aiplatform.admin`.

For more information about enabling and disabling data caching, see [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/data-governance#enabling-disabling-caching).

In [None]:
# Export the values that the following %%bash cell reads as ${API_ENDPOINT}
# and ${PROJECT_ID}.
os.environ.update(
    {
        "API_ENDPOINT": f"{LOCATION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}",
        "PROJECT_ID": PROJECT_ID,
    }
)

In [None]:
%%bash

# Enable caching.
# The JSON body is double-quoted (with escaped inner quotes) so that the shell
# expands ${PROJECT_ID}; inside single quotes it would be sent literally.
curl -X PATCH \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  "https://${API_ENDPOINT}/cacheConfig" \
  -d "{
    \"name\": \"projects/${PROJECT_ID}/cacheConfig\",
    \"disableCache\": false
  }"

### Send a request with large, common content

To increase the chance of an implicit cache hit:

- Put large and common contents at the beginning of your prompt
- Send requests with a similar prefix within a short period of time

In this example, you send a request with an image at the beginning of your prompt and add a user's request/question at the end of the prompt.

In [None]:
# The large, shared part (the image) goes first and the question last.
meal_image = Part.from_uri(
    file_uri="https://storage.googleapis.com/cloud-samples-data/generative-ai/image/meal.png",
    mime_type="image/png",
)
image_question = "Describe this image."

response = client.models.generate_content(
    model=MODEL_ID,
    contents=[meal_image, image_question],
)

display(Markdown(response.text))

### Send requests with similar prefixes

To demonstrate the implicit cache hit, repeat the same request multiple times, and print out the `cached_content_token_count` in the usage metadata which indicates how many tokens in the request were cached.

In [None]:
NUM_ATTEMPTS = 5  # @param {type: "integer"}

# The request is identical on every attempt, so build it once up front.
blog_request = [
    Part.from_uri(
        file_uri="https://storage.googleapis.com/cloud-samples-data/generative-ai/image/meal.png",
        mime_type="image/png",
    ),
    "Write a short and engaging blog post based on this picture.",
]

for attempt in range(1, NUM_ATTEMPTS + 1):
    response = client.models.generate_content(model=MODEL_ID, contents=blog_request)
    usage = response.usage_metadata

    # Treat a missing (falsy) count as zero cached tokens.
    cached_token_count = usage.cached_content_token_count or 0

    print(f"#{attempt} Attempt")
    print(f"Input tokens: {usage.prompt_token_count}")
    print(f"Cached tokens: {cached_token_count}")
    print(f"Output tokens: {usage.candidates_token_count}")
    print(f"Total tokens: {usage.total_token_count}")
    print()

    # Keep retrying until a request reports cached tokens.
    if cached_token_count <= 0:
        continue

    print(usage.cache_tokens_details)
    print("Cached content found, exiting loop.")
    break

## Explicit caching

Using the explicit caching feature, you can pass some content to the model once, cache the input tokens, and then refer to the cached tokens for subsequent requests. The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro.

### Create a context cache

Create a `CachedContent` object specifying the prompt you want to use, including the file and other fields you wish to cache.

**Notes**
- Caches are model specific. You cannot use a cache made with a different model as their tokenization might be slightly different.
- The default expiration time of a context cache is 60 minutes. You can specify a different expiration time using the `ttl` (time to live) or the `expire_time` property.

This example shows how to create a context cache using two large research papers stored in a Cloud Storage bucket, and set the `ttl` to 600s.

- Paper 1: [Gemini: A Family of Highly Capable Multimodal Models](https://arxiv.org/abs/2312.11805)
- Paper 2: [Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context](https://arxiv.org/abs/2403.05530)

In [None]:
system_instruction = """
You are an expert researcher who has years of experience in conducting systematic literature surveys and meta-analyses of different topics.
You pride yourself on incredible accuracy and attention to detail. You always stick to the facts in the sources provided, and never make up new facts.
Now look at the research paper below, and answer the following questions in 1-2 sentences.
"""

# The two research papers to cache, in the order they are sent to the model.
paper_uris = (
    "gs://cloud-samples-data/generative-ai/pdf/2312.11805v3.pdf",
    "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
)
paper_parts = [
    Part.from_uri(file_uri=paper_uri, mime_type="application/pdf")
    for paper_uri in paper_uris
]

# Cache both PDFs together with the system instruction for 600 seconds.
cache_config = CreateCachedContentConfig(
    contents=[Content(role="user", parts=paper_parts)],
    system_instruction=system_instruction,
    ttl="600s",
)

cached_content = client.caches.create(model=MODEL_ID, config=cache_config)

You can access the properties of the cached content as shown in the example below. You can use its `name` or `resource_name` to reference the contents of the context cache.

**Note**: The `name` of the context cache is also referred to as cache ID.

In [None]:
# Print the cache's properties, one per line, in a fixed order.
for field_name in ("name", "model", "create_time", "expire_time", "usage_metadata"):
    print(getattr(cached_content, field_name))

### Retrieve a context cache

You can use the property `name` to reference the contents of the context cache. For example:

In [None]:
# Fetch the cache again using only its name (the cache ID).
cache_name = cached_content.name
new_cached_content = client.caches.get(name=cache_name)

### Use a context cache

To use the context cache, you provide the `cached_content` resource name in the `config` parameter of the `generate_content()` method.

Then you can query the model with a prompt, and the cached content will be used as a prefix to the prompt.

In [None]:
# Point the request at the cache by name; only the new question is sent as contents.
cached_request_config = GenerateContentConfig(cached_content=cached_content.name)
papers_question = "What is the research goal shared by these research papers?"

response = client.models.generate_content(
    model=MODEL_ID,
    contents=papers_question,
    config=cached_request_config,
)

display(Markdown(response.text))

You can check `cached_content_token_count` in the usage metadata, which indicates how many tokens in the request were cached.

In [None]:
# Token accounting for the previous request; a missing cached count is shown as 0.
usage = response.usage_metadata

print(f"Input tokens: {usage.prompt_token_count}")
print(f"Cached tokens: {usage.cached_content_token_count or 0}")
print(f"Output tokens: {usage.candidates_token_count}")
print(f"Total tokens: {usage.total_token_count}")

### Use context caching in chat

You can use the context cache in a multi-turn chat session.


In [None]:
# Create a chat session whose config references the context cache by name.
chat_config = GenerateContentConfig(cached_content=cached_content.name)
chat = client.chats.create(model=MODEL_ID, config=chat_config)

In [None]:
# First chat turn: compare responsible-AI approaches across the two papers.
prompt = (
    "\n"
    "How do the approaches to responsible AI development and mitigation strategies in Gemini 1.5 evolve from those in Gemini 1.0?"
    "\n"
)

response = chat.send_message(prompt)

answer_markdown = Markdown(response.text)
display(answer_markdown)

In [None]:
# Second chat turn, sent on the same chat session.
prompt = """
Given the advancements presented in Gemini 1.5, what are the key future research directions identified in both papers
for further improving multimodal AI models?
"""

response = chat.send_message(prompt)

# Render as Markdown, like every other model response in this notebook,
# so that lists and emphasis in the answer are formatted instead of shown raw.
display(Markdown(response.text))

### Update the expiration time of a context cache

The default expiration time of a context cache is 60 minutes. To update the expiration time, update one of the following properties:

`ttl` - The number of seconds that the cache lives after it's created or after the `ttl` is updated before it expires.

`expire_time` - A Timestamp that specifies the absolute date and time when the context cache expires.

In [None]:
# Only the expiration (`ttl` or `expire_time`) of an existing cache can be
# updated, so pass the dedicated update config rather than the creation config
# (whose `system_instruction` has no effect on an update).
cached_content = client.caches.update(
    name=cached_content.name,
    config=UpdateCachedContentConfig(ttl="3600s"),
)

print(cached_content.expire_time)

### Delete a context cache

You can delete a context cache that you no longer need using the delete operation.

In [None]:
# Delete the cache by name.
cache_name = cached_content.name
client.caches.delete(name=cache_name)

## Next Steps

- Learn more about context caching on the [context caching overview](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-overview) page.