In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Building a Multimodal Trip Planner with ADK on Vertex AI Agent Engine Memory Bank

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fagents%2Fagent_engine%2Fmemory_bank%2Ftutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/agents/agent_engine/memory_bank/tutorial_get_started_with_multimodal_agents_with_memory_bank.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Authors |
| --- |
| [Yee Sian](https://github.com/yeesian) |
| [Shawn Yang](https://github.com/shawn-yang-google) |
| [Kimberly Milam](https://github.com/klmilam) |
| [Ivan Nardini](https://github.com/inardini) |

<div class="alert alert-block alert-warning">
    <b>Warning:</b> Vertex AI Memory Bank is in Preview. Multimodal memories in this tutorial will depend on the dimensions of your multimodal inputs and the customization features you leverage.
</div>

## Overview

Imagine a trip planner that remembers your travel photos from Paris, understands your video tour of Tokyo's streets, and recalls your voice notes about that perfect gelato shop in Rome. This tutorial teaches you to build exactly that—a multimodal trip planner powered by Memory Bank, the managed memory service of Vertex AI Agent Engine.

### What You'll Learn

- **Multimodal Memory Generation**: Extract memories from images, videos, and audio
- **Smart Retrieval**: Fetch relevant memories based on context and similarity
- **Agent Development**: Build an ADK agent that thinks like a travel companion using your multimodal inputs
- **Agent Deployment**: Deploy your multimodal agent to Vertex AI Agent Engine

## Get started

### Install the Vertex AI SDK, ADK, and other required packages


In [None]:
%pip install --upgrade --quiet "google-cloud-aiplatform[agent_engines, adk]==1.115.0" "google-adk==1.14.1"

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Resolve the project, region, and staging bucket. Any placeholder left
# unchanged falls back to an environment variable or a default value.
import os

from vertexai import Client

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}  # fmt: skip
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # NOTE(review): when GOOGLE_CLOUD_PROJECT is unset, str(None) turns this
    # into the literal string "None" instead of failing — set the project
    # explicitly if the environment variable is not available.
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

# Region: GOOGLE_CLOUD_REGION when set, otherwise "us-central1".
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}  # fmt: skip
if not BUCKET_NAME or BUCKET_NAME == "[your-bucket-name]":
    # Default the bucket name to the project ID.
    BUCKET_NAME = PROJECT_ID

BUCKET_URI = f"gs://{BUCKET_NAME}"

# Set variables for ADK
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "1"
# Application name and user ID used when creating ADK sessions later on.
APP_NAME = "trip_planner"
USER_ID = "traveler_123"

# Create the staging bucket for Agent Engine
!gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI

client = Client(project=PROJECT_ID, location=LOCATION)

### Import libraries

We import all the Python libraries we'll need.

In [None]:
import asyncio
import warnings
from typing import Any
from IPython.display import Image, Video, Audio, display

warnings.filterwarnings("ignore")
from google.adk import tools
from google.adk.agents import LlmAgent
from google.adk.memory import VertexAiMemoryBankService
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService, VertexAiSessionService
from google.adk.tools import FunctionTool
from google.genai import types
from vertexai  import types as vertexai_types
from vertexai.agent_engines import AgentEngine
from vertexai import types as vertexai_types
from vertexai.types import ManagedTopicEnum
from vertexai.types import MemoryBankCustomizationConfigMemoryTopic as MemoryTopic
from vertexai.types import (
    MemoryBankCustomizationConfigMemoryTopicCustomMemoryTopic as CustomMemoryTopic,
)
from vertexai.types import (
    MemoryBankCustomizationConfigMemoryTopicManagedMemoryTopic as ManagedMemoryTopic,
)

### Helpers

We define some helper functions you will use along the tutorial.

In particular, the `MultimodalMemoryGenerator` takes different media types (image, video, audio) along with some text context, formats them correctly for the Memory Bank API, and triggers the memory generation process.

> Note: Memory Bank doesn't just store the raw files. It uses Gemini and its multimodal capabilities to process the file and the context, extracting key facts and concepts. These extracted facts are what become the "memories."

The other helpers, `_parse_memories`, `call_agent`, and `call_remote_agent`, are utilities for extracting API responses and running our agent, both locally and when it's deployed.


In [None]:
def display_image(image_uri: str, width: int = None, height: int = None):
    """Render an image inline in the notebook.

    Args:
        image_uri: Location of the image, passed to `IPython.display.Image`.
        width: Optional display width forwarded to `Image`.
        height: Optional display height forwarded to `Image`.
    """
    rendered = Image(image_uri, width=width, height=height)
    display(rendered)

def display_video(video_uri: str, width: int = None, height: int = None):
    """Render a video inline in the notebook.

    Args:
        video_uri: Location of the video, passed to `IPython.display.Video`.
        width: Optional display width forwarded to `Video`.
        height: Optional display height forwarded to `Video`.
    """
    player = Video(video_uri, width=width, height=height)
    display(player)

def display_audio(audio_uri: str):
    """Render an audio player inline in the notebook.

    Args:
        audio_uri: Location of the audio file, passed to `IPython.display.Audio`.
    """
    player = Audio(audio_uri)
    display(player)

class MultimodalMemoryGenerator:
    """Generates Memory Bank memories from image, video, and audio files.

    Each ``generate_from_*`` method validates the file extension of the given
    URI, derives a MIME type from it, sends the file together with a text
    context to ``client.agent_engines.memories.generate`` and returns the
    facts of the memories that were generated.
    """

    # Extensions accepted per media kind. The validated extension is used
    # verbatim as the MIME subtype, except for the aliases below.
    _IMAGE_TYPES = ("webp", "jpeg", "png", "jpg")
    _VIDEO_TYPES = (
        "mp4",
        "mpeg",
        "x-flv",
        "quicktime",
        "mpegps",
        "mpg",
        "webm",
        "wmv",
        "3gpp",
    )
    _AUDIO_TYPES = (
        "aac",
        "flac",
        "mp3",
        "m4a",
        "mpeg",
        "mpga",
        "mp4",
        "opus",
        "pcm",
        "wav",
        "webm",
    )
    _SUBTYPE_ALIASES = {"jpg": "jpeg"}

    def __init__(self, client, agent_engine_name):
        """Store the client and the Agent Engine resource name to target.

        Args:
            client: Client exposing ``agent_engines.memories.generate`` and
                ``agent_engines.memories.get``.
            agent_engine_name: Resource name passed as ``name`` to ``generate``.
        """
        self.client = client
        self.agent_engine_name = agent_engine_name

    def generate_from_image(self, image_uri: str, context: str, user_id: str):
        """Generate memories from travel photos.

        Args:
            image_uri: URI of the image; its extension selects the MIME type.
            context: Text sent alongside the image.
            user_id: User the generated memories are scoped to.

        Returns:
            A list of ``{"fact": ...}`` dictionaries.

        Raises:
            ValueError: If the file extension is not a supported image type.
        """
        mime_type = self._resolve_mime_type(image_uri, "image", self._IMAGE_TYPES)
        return self._generate(image_uri, mime_type, context, user_id)

    def generate_from_video(self, video_uri: str, context: str, user_id: str):
        """Generate memories from travel videos.

        Args:
            video_uri: URI of the video; its extension selects the MIME type.
            context: Text sent alongside the video.
            user_id: User the generated memories are scoped to.

        Returns:
            A list of ``{"fact": ...}`` dictionaries.

        Raises:
            ValueError: If the file extension is not a supported video type.
        """
        mime_type = self._resolve_mime_type(video_uri, "video", self._VIDEO_TYPES)
        return self._generate(video_uri, mime_type, context, user_id)

    def generate_from_audio(self, audio_uri: str, context: str, user_id: str):
        """Generate memories from voice notes about travel.

        Args:
            audio_uri: URI of the audio file; its extension selects the MIME type.
            context: Text sent alongside the audio, prefixed with
                "Voice note context: ".
            user_id: User the generated memories are scoped to.

        Returns:
            A list of ``{"fact": ...}`` dictionaries.

        Raises:
            ValueError: If the file extension is not a supported audio type.
        """
        mime_type = self._resolve_mime_type(audio_uri, "audio", self._AUDIO_TYPES)
        return self._generate(
            audio_uri, mime_type, f"Voice note context: {context}", user_id
        )

    @classmethod
    def _resolve_mime_type(cls, uri: str, media_kind: str, supported) -> str:
        """Derive ``<media_kind>/<subtype>`` from the extension of ``uri``."""
        extension = uri.split(".")[-1]
        # Match case-insensitively so that e.g. "photo.JPG" is accepted.
        subtype = extension.lower()
        if subtype not in supported:
            raise ValueError(f"Unsupported {media_kind} file type: {extension}")
        subtype = cls._SUBTYPE_ALIASES.get(subtype, subtype)
        return f"{media_kind}/{subtype}"

    def _generate(self, file_uri: str, mime_type: str, text: str, user_id: str):
        """Send one user event (text + file) to Memory Bank and parse the result."""
        events = [
            {
                "content": {
                    "role": "user",
                    "parts": [
                        {"text": text},
                        {"file_data": {"file_uri": file_uri, "mime_type": mime_type}},
                    ],
                }
            }
        ]

        operation = self.client.agent_engines.memories.generate(
            name=self.agent_engine_name,
            direct_contents_source={"events": events},
            scope={"user_id": user_id},
            config={"wait_for_completion": True},
        )

        return self._parse_memories(operation)

    def _parse_memories(self, operation):
        """Parse and format generated memories.

        Looks up every generated memory that was not deleted and collects its
        fact. A failed lookup is reported with ``print`` and skipped so that
        the remaining memories are still returned.
        """
        memories = []
        if operation.response and operation.response.generated_memories:
            for gen_memory in operation.response.generated_memories:
                if gen_memory.action != "DELETED" and gen_memory.memory:
                    try:
                        # Use the client this instance was built with rather
                        # than a notebook-level global.
                        full_memory = self.client.agent_engines.memories.get(
                            name=gen_memory.memory.name
                        )
                        memories.append({"fact": full_memory.fact})
                    except Exception as e:
                        print(f"Could not get any memory: {e}")
        else:
            print("No memories generated")
        return memories


def call_agent(runner: Runner, content: types.Content, session_id: str, user_id: str):
    """Runs one turn against the local agent and prints the final response.

    Args:
        runner: ADK runner used to execute the agent.
        content: The user message to send.
        session_id: The ID of the session the message belongs to.
        user_id: The ID of the user who owns the session.

    Returns:
        The text of the first part of the final response, or None when the
        run produces no final response or the final response has no parts.
    """
    events = runner.run(user_id=user_id, session_id=session_id, new_message=content)

    for event in events:
        if not event.is_final_response():
            continue
        # Guard against a final event without content parts instead of
        # raising AttributeError / IndexError on the lookup below.
        has_parts = event.content is not None and event.content.parts
        final_response = event.content.parts[0].text if has_parts else None
        print("=" * 50)
        print("Agent Response:", final_response)
        print("=" * 50)
        return final_response
    return None


async def call_remote_agent(
    agent: AgentEngine, session_id: str, message: dict, user_id: str
):
    """Sends a message to the deployed agent, streams the response,
    and prints only the final text output.

    The whole event stream is consumed; the text of the last text part seen
    is treated as the final response.

    Args:
        agent: The deployed agent to query.
        session_id: The ID of the current conversation session.
        message: The message payload, including text and/or file data.
        user_id: The ID of the user who owns the session.

    Returns:
        The final response text, or an empty string if no event carried text.
    """
    final_response = ""
    async for event in agent.async_stream_query(
        user_id=user_id,
        session_id=session_id,
        message=message,
    ):
        content = event.get("content") or {}
        for part in content.get("parts") or []:
            # Later text parts overwrite earlier ones so that intermediate
            # output is not mistaken for the final answer.
            if part.get("text"):
                final_response = part["text"]

    print("=" * 50)
    print("Agent Response:", final_response)
    print("=" * 50)
    return final_response

## Creating multimodal Memory Bank with Travel-Specific Topics


### Create Topics

Before we store memories, we define their structure. Memory Bank uses "topics" to organize information and lets you customize them so that it extracts the memories that are relevant to your use case.

We will use a mix of:
- Managed Topics: Pre-defined topics for common concepts such as USER_PREFERENCES and USER_PERSONAL_INFO.
- Custom Topics: Concepts we define ourselves, tailored to our specific domain—in this case, travel. This is where we define the schema for our agent's memory. Memory Bank uses 'topics' to categorize information, which helps the underlying model know what's important to extract and save.  

In our case, we're defining travel_experiences, travel_preferences, and travel_logistics. The description for each custom topic acts as a prompt, guiding the model on what kind of facts to extract for that category.


In [None]:
# Topics passed to Memory Bank through `memory_bank_config`: two managed
# topics followed by three custom, travel-specific topics.
travel_topics = [
    # Managed topic: user preferences.
    MemoryTopic(
        managed_memory_topic=ManagedMemoryTopic(
            managed_topic_enum=ManagedTopicEnum.USER_PREFERENCES
        )
    ),
    # Managed topic: personal information about the user.
    MemoryTopic(
        managed_memory_topic=ManagedMemoryTopic(
            managed_topic_enum=ManagedTopicEnum.USER_PERSONAL_INFO
        )
    ),
    # Custom topic: what the user did and enjoyed on past trips.
    MemoryTopic(
        custom_memory_topic=CustomMemoryTopic(
            label="travel_experiences",
            description="""Memorable travel experiences including:
                - Places visited and impressions
                - Favorite restaurants, cafes, and food experiences
                - Preferred accommodation types and locations
                - Activities enjoyed (museums, hiking, beaches, etc.)
                - Travel companions and social preferences
                - Photos and videos from trips with location context""",
        )
    ),
    # Custom topic: how the user likes to travel.
    MemoryTopic(
        custom_memory_topic=CustomMemoryTopic(
            label="travel_preferences",
            description="""Travel style and preferences:
                - Budget preferences (luxury, mid-range, budget)
                - Transportation preferences (flying, trains, driving)
                - Trip duration preferences
                - Season and weather preferences
                - Cultural interests and language abilities
                - Dietary restrictions and food preferences""",
        )
    ),
    # Custom topic: practical details needed to organise a trip.
    MemoryTopic(
        custom_memory_topic=CustomMemoryTopic(
            label="travel_logistics",
            description="""Practical travel information:
                - Passport and visa information
                - Frequent flyer numbers and hotel loyalty programs
                - Emergency contacts
                - Medical considerations and insurance
                - Packing preferences and essentials
                - Time zone preferences and jet lag strategies""",
        )
    ),
]

### Configure Memory Bank with our travel topics

Now we assemble the full memory_bank_config, which will be passed to our Agent Engine when we create it.

This config is composed of a few key parts.

The customization_configs is where we pass in our travel_topics list from the previous cell, telling Memory Bank to use our custom schema. For similarity_search_config, we're specifying gemini-embedding-001. This model is used for similarity searches when retrieving memories. Finally, the generation_config specifies gemini-2.5-flash as the model. This model will be used for the generation of memories (i.e., processing the inputs and extracting the facts).

In [None]:
# Memory Bank settings passed to the Agent Engine under
# `context_spec["memory_bank_config"]`.
memory_bank_config = {
    # Topics that guide which facts are extracted and stored.
    "customization_configs": [
        {
            "memory_topics": travel_topics,
        }
    ],
    # Embedding model used for similarity search when retrieving memories.
    "similarity_search_config": {
        "embedding_model": f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/gemini-embedding-001"
    },
    # Model used to generate memories from the inputs.
    "generation_config": {
        "model": f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/gemini-2.5-flash"
    },
}

### Create Agent Engine with Memory Bank

This is a key step. We're calling client.agent_engines.create() to create a new Agent Engine instance.

An Agent Engine is the managed Vertex AI resource that hosts our agent and its associated services. Crucially, we are not deploying our agent code yet. We are just provisioning the infrastructure that includes the Memory Bank.

We pass our memory_bank_config inside the context_spec dictionary. This associates our custom memory schema (topics, models) with this specific Agent Engine. This call will create a persistent resource in your project that you can see in the Google Cloud console.

In [None]:
agent_engine = client.agent_engines.create(
    config={
        "display_name": "Multimodal Memory Bank",
        "description": "Multimodal Memory Bank with Travel-Specific Topics",
        "context_spec": {"memory_bank_config": memory_bank_config},
    }
)

### Test the memory

Let's instantiate our MultimodalMemoryGenerator helper class. We pass it the client (for making API calls) and the agent_engine.api_resource.name. This resource name is the unique identifier for the Agent Engine we just created, telling the SDK which Memory Bank to send data to.

In [None]:
memory_gen = MultimodalMemoryGenerator(client, agent_engine.api_resource.name)

#### Generate memories for audio input

First, let's ingest an audio file. This .wav file is a voice note about a trip to Gaeta, Italy. We provide the file and the context "I am travelling in Gaeta."

In [None]:
audio_uri="https://storage.googleapis.com/github-repo/audio_ai/gaeta.wav"
display_audio(audio_uri=audio_uri)

In [None]:
# Generate memories from the voice note, scoped to the shared tutorial user.
# The file is referenced by its gs:// URI here.
audio_memories = memory_gen.generate_from_audio(
    audio_uri="gs://github-repo/audio_ai/gaeta.wav",
    context="I am travelling in Gaeta",
    user_id=USER_ID,
)

The output printed below is rich in detail. Because the audio file itself contains a rich description (castles, beaches, local food), Memory Bank can extract multiple facts from that single file, all of which fall under our custom travel_experiences topic.

This demonstrates the importance of using multimodal inputs for memory generation.

In [None]:
print("Generated memories from audio:")
for memory in audio_memories:
    print(f"  - {memory['fact']}")

#### Generate memories for image

Next, let's generate memories from an image. We're calling `generate_from_image` passing a photo of Marienplatz, a text context ("My visit to Germany last summer..."), and the user_id. This user_id is important as it scopes the memory to this specific user.


Behind the scenes, our helper class calls the `agent_engines.memories.generate()` API. Memory Bank receives both the image and the text, uses a multimodal model to understand them together, and then uses our generation_config model (Gemini 2.5 Flash) to extract facts that match our travel_topics.

In [None]:
image_uri = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/landmark1.jpg"
display_image(image_uri, width=500, height=500)

In [None]:
# Generate memories from the photo displayed above, scoped to the shared
# tutorial user.
image_memories = memory_gen.generate_from_image(
    image_uri=image_uri,
    context="My visit to Germany last summer, walking around Marienplatz in Munich",
    user_id=USER_ID,
)

Let's check the output. The _parse_memories helper in our class retrieves the full memory fact. As you can see, the model correctly extracted the key information and saved it as a simple, factual statement.


In [None]:
print("Generated memories from image:")
for memory in image_memories:
    print(f"  - {memory['fact']}")

#### Generate memories for video input

Now let's try with a video. The process is identical: we provide an .mp4 file and a simple text context. The MultimodalMemoryGenerator class handles setting the correct video/mp4 MIME type.


In [None]:
video_uri = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4"
display_video(video_uri=video_uri)

In [None]:
# Generate memories from the video displayed above, scoped to the shared
# tutorial user.
video_memories = memory_gen.generate_from_video(
    video_uri=video_uri,
    context="I am travelling in Europe",
    user_id=USER_ID,
)

Again, we print the result. The model distills the input into a core fact.


In [None]:
print("Generated memories from video:")
for memory in video_memories:
    print(f"  - {memory['fact']}")

## Building the Trip Planning Agent with multimodal memory

Time to build our multimodal agent using ADK.

### Build custom tools

We're defining a simple Python function `calculate_trip_budget`. The key parts are the type hints (destination: str, days: int) and the docstring. The ADK framework and the underlying LLM will inspect this docstring (especially the description "Calculates estimated budget for a trip.") to understand when and how to use this tool.


In [None]:
def calculate_trip_budget(destination: str, days: int, style: str) -> dict[str, Any]:
    """Calculates estimated budget for a trip.

    Use this tool when the user asks about trip costs or budget estimates.

    Args:
        destination: The destination city or country
        days: Number of days for the trip
        style: Travel style (budget, mid-range, luxury)

    Returns:
        A dictionary with budget breakdown.
        Example: {'status': 'success', 'total': 3000, 'daily': 500, 'currency': 'USD'}
    """
    # Flat per-day rate for each recognised travel style.
    rate_by_style = {"budget": 100, "mid-range": 250, "luxury": 500}
    # Share of the daily rate attributed to each spending category.
    category_shares = {
        "accommodation": 0.4,
        "food": 0.3,
        "activities": 0.2,
        "transport": 0.1,
    }

    style_key = style.lower()
    # Unrecognised styles fall back to the mid-range rate.
    daily = rate_by_style[style_key] if style_key in rate_by_style else 250

    per_category = {
        category: daily * share for category, share in category_shares.items()
    }

    budget: dict[str, Any] = {"status": "success"}
    budget["total"] = daily * days
    budget["daily"] = daily
    budget["currency"] = "USD"
    budget["breakdown"] = per_category
    return budget

Here, we wrap our Python function in the ADK's FunctionTool class. This converts the function into an object that the ADK LlmAgent can understand and execute.


In [None]:
budget_tool = FunctionTool(func=calculate_trip_budget)

### Create Enhanced Agent with Tools

This is where we define the 'brain' of our agent using the ADK's LlmAgent class.

We specify gemini-2.5-flash for fast, intelligent responses.

The instruction parameter is the system prompt. We're telling it who it is ("an advanced trip planning assistant"), what it can do ("access to user's complete travel history"), and how to behave ("Always mention what memories influenced your suggestions").

Finally, the tools list defines what the agent can do. We're giving it our custom budget_tool and, just as importantly, the built-in tools.preload_memory_tool.PreloadMemoryTool(). This built-in tool is what's responsible for automatically fetching relevant memories from Memory Bank at the start of a new conversation turn.


In [None]:
# Agent definition: the model, the system instruction, and the tools it may use.
trip_planner_agent = LlmAgent(
    model="gemini-2.5-flash",
    name="TripPlanner",
    # System prompt: describes the agent's role, capabilities, and behaviour.
    instruction="""You are an advanced trip planning assistant with memory and tools.

    Your capabilities:
    - Access to user's complete travel history through memories
    - Use 'calculate_trip_budget' to provide cost estimates
    - Remember all conversations and preferences

    Tool usage guidelines:
    - Always mention what memories influenced your suggestions
    - After providing budget, ask if they want adjustments

    Be personal and reference specific past experiences when available.
    If not available, just keep talking with the user. Don't make up facts.
    """,
    # PreloadMemoryTool supplies memories from the runner's memory service;
    # budget_tool wraps calculate_trip_budget.
    tools=[tools.preload_memory_tool.PreloadMemoryTool(), budget_tool],
)

### Test the agent

Let's test our agent. For the sake of a self-contained test, this code creates a new, empty Agent Engine. This means the test that follows will first populate this new memory, and then test retrieval.

In [None]:
# Create a fresh Agent Engine (with the same Memory Bank configuration) for
# the agent test.
# NOTE: this rebinds `agent_engine`, so the engine created earlier in the
# notebook is no longer reachable through this name.
agent_engine = client.agent_engines.create(
    config={
        "display_name": "Multimodal Memory Bank",
        "description": "Multimodal Memory Bank with Travel-Specific Topics",
        "context_spec": {"memory_bank_config": memory_bank_config},
    }
)

Before we can run our agent locally, we need to configure its services. The ADK Runner ties everything together.

It needs a session_service to store short-term conversation history; InMemorySessionService is perfect for local testing as it just keeps the session in your notebook's RAM.

It also needs a memory_service for long-term memory. We instantiate VertexAiMemoryBankService and point it to our deployed Agent Engine's ID. This tells the ADK LlmAgent (and its PreloadMemoryTool) which cloud-based Memory Bank to connect to.


In [None]:
# Session service for the local test (conversation history for each session).
session_service = InMemorySessionService()

# Long-term memory backed by the Memory Bank of the Agent Engine created above.
memory_bank_service = VertexAiMemoryBankService(
    # The service takes the bare engine ID: the last segment of the resource name.
    agent_engine_id=agent_engine.api_resource.name.split("/")[-1],
    project=PROJECT_ID,
    location=LOCATION,
)

# The runner ties the agent to its session and memory services.
runner = Runner(
    agent=trip_planner_agent,
    app_name=APP_NAME,
    session_service=session_service,
    memory_service=memory_bank_service,
)

This async function defines a complete, multi-turn test conversation with our local agent. It's designed to simulate a user providing multiple pieces of multimodal information across several turns.

It starts by creating a session and sends a series of messages (text, image, video, audio) using our call_agent helper.

The most critical step is at the end: memory_bank_service.add_session_to_memory(final_session_state). This is the
consolidation step. It takes the entire conversation from the
InMemorySessionService and sends it to the VertexAiMemoryBankService to be processed and saved permanently.

To verify it worked, the function then starts a new session and asks a question ("suggest a cultural destination...") to show that the agent can recall and use the memories from the previous session.


In [None]:
async def test_trip_planner():
    """Runs a multi-turn, multimodal conversation and checks memory recall.

    The first session sends a greeting followed by an image, a video, and an
    audio message. The session is then added to Memory Bank, and a second
    session asks for a recommendation based on what was shared before.

    Raises:
        ValueError: If a media URI has an unsupported file extension.
    """

    def file_part(file_uri: str, media_kind: str, supported_types: list[str]) -> dict:
        """Build a `file_data` part whose MIME type is derived from the extension."""
        extension = file_uri.split(".")[-1]
        if extension not in supported_types:
            raise ValueError(f"Unsupported {media_kind} file type: {extension}")
        subtype = "jpeg" if extension == "jpg" else extension
        return {
            "file_data": {
                "file_uri": file_uri,
                "mime_type": f"{media_kind}/{subtype}",
            }
        }

    session = await session_service.create_session(app_name=APP_NAME, user_id=USER_ID)
    print(f"🌍 Starting a new trip planning session: {session.id}")

    image_uri = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/landmark1.jpg"
    video_uri = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4"
    audio_uri = "gs://github-repo/audio_ai/gaeta.wav"

    image_types = ["webp", "jpeg", "png", "jpg"]
    video_types = [
        "mp4",
        "mpeg",
        "x-flv",
        "quicktime",
        "mpegps",
        "mpg",
        "webm",
        "wmv",
        "3gpp",
    ]
    audio_types = [
        "aac",
        "flac",
        "mp3",
        "m4a",
        "mpeg",
        "mpga",
        "mp4",
        "opus",
        "pcm",
        "wav",
        "webm",
    ]

    # Each turn: (text, media URI or None, media kind, supported extensions).
    turns = [
        ("Hello!", None, None, None),
        (
            "I'm planning a trip. First, here is a picture that shows you the kind of place I like.",
            image_uri,
            "image",
            image_types,
        ),
        (
            "Next, here's a video. I also enjoy cities close to Mediterranean sea.",
            video_uri,
            "video",
            video_types,
        ),
        (
            "Finally, I loved Gaeta. To give you an idea, Here is an audio",
            audio_uri,
            "audio",
            audio_types,
        ),
    ]

    for text, media_uri, media_kind, supported_types in turns:
        parts = [{"text": text}]
        if media_uri is not None:
            parts.append(file_part(media_uri, media_kind, supported_types))
        message = types.Content(role="user", parts=parts)
        # call_agent requires the runner as its first argument.
        call_agent(runner, content=message, session_id=session.id, user_id=USER_ID)
        # The pause only takes effect when the coroutine is awaited.
        await asyncio.sleep(5)

    print("\n---------------------------------------------------")
    print("Conversation finished. Consolidating all memories at once...")
    final_session_state = await session_service.get_session(
        app_name=APP_NAME, user_id=USER_ID, session_id=session.id
    )
    await memory_bank_service.add_session_to_memory(final_session_state)
    print("✅ Full conversation context (Image, Video, Audio) saved to Memory Bank.")
    print("---------------------------------------------------")

    new_session = await session_service.create_session(
        app_name=APP_NAME, user_id=USER_ID
    )
    print(
        f"\n🌅 Starting a NEW session ({new_session.id}) to test cumulative memory..."
    )

    text_message = types.Content(role="user", parts=[{"text": "Hello!"}])

    call_agent(
        runner, content=text_message, session_id=new_session.id, user_id=USER_ID
    )

    verification_message = types.Content(
        role="user",
        parts=[
            {
                "text": "Based on the picture, video, AND audio I shared with you before, suggest a cultural destination for me."
            }
        ],
    )

    call_agent(
        runner,
        content=verification_message,
        session_id=new_session.id,
        user_id=USER_ID,
    )

Let's run the test! Watch the output. The first few turns are the agent acknowledging the new information (image, video, audio). After the 'Consolidating all memories' message, a new session starts.

Notice how the agent's response to "Hello!" is now different. It says "Hello again!" and immediately references past memories. This is the PreloadMemoryTool in action!

The final response is a detailed recommendation for Naples, which explicitly references the image, the video, and the audio note about Gaeta, demonstrating a sophisticated, context-aware agent using its multimodal memory.


In [None]:
# Run the local multi-turn test: image, video and audio turns, memory
# consolidation, then a fresh session that should recall the earlier context.
await test_trip_planner()

## Deploy the agent on Vertex AI Agent Engine

### Deploy the agent

It's time to deploy your agent! So far, our LlmAgent has only been running locally in this notebook. We will now deploy our agent to the cloud as a scalable, serverless endpoint.

We call `client.agent_engines.create` again, but this time we pass our local agent object with `agent=trip_planner_agent`. The `config` dictionary now defines the deployment itself. We provide a `display_name` and, critically, pass our `memory_bank_config` again in the `context_spec`. This links our deployed agent code to the same Memory Bank configuration. We also provide the `requirements` and the `staging_bucket` URI we defined at the start.

The create call will automatically "pickle" our trip_planner_agent object, bundle it with our requirements, and upload it to this bucket. This process takes a few minutes as it builds a container image and provisions the serving infrastructure.

In [None]:
# Packages installed in the remote runtime, pinned to exact versions.
deployment_requirements = [
    "google-cloud-aiplatform[agent_engines,adk]==1.115.0",
    "google-adk==1.14.1",
]

# Deployment settings: metadata, the Memory Bank configuration passed through
# `context_spec`, the pinned requirements, and the staging bucket.
deployment_config = {
    "display_name": "Multimodal Memory Bank",
    "description": "Multimodal Memory Bank with Travel-Specific Topics",
    "context_spec": {"memory_bank_config": memory_bank_config},
    "requirements": deployment_requirements,
    "staging_bucket": BUCKET_URI,
}

# Deploy the local agent object with the settings above.
remote_mm_adk_agent = client.agent_engines.create(
    agent=trip_planner_agent,
    config=deployment_config,
)

### Submit a multimodal query

The `create` command returns a client object representing the deployed agent. We'll grab its `api_resource.name`, which is the full, unique identifier for our deployed agent. We need this name to send it queries, so we use it to retrieve a handle to the deployed agent engine with `agent_engines.get`.

In [None]:
# Keep the full resource name of the deployment; the remote test later takes
# its last "/"-separated segment as the agent engine ID.
remote_mm_adk_agent_name = remote_mm_adk_agent.api_resource.name
# Rebind `remote_mm_adk_agent` to the object returned by `agent_engines.get`;
# the remote test calls `async_create_session` on it.
# NOTE: order matters: the name must be read before the variable is rebound.
remote_mm_adk_agent = agent_engines.get(remote_mm_adk_agent_name)

Now you are ready to hit the endpoint. Here is a test function that is similar to our local test, but designed to talk to the deployed agent. The function uses `remote_mm_adk_agent.async_create_session()` to create a new session ID and formats the messages (e.g., `image_message`) as simple dictionaries for the `async_stream_query` API. It also explicitly performs the session retrieval and memory consolidation from the client side using `VertexAiSessionService` and `VertexAiMemoryBankService`. This demonstrates how you can have fine-grained control over memory persistence, even with a deployed agent, just as we did in the local test.

In [None]:
# File extensions accepted for each media kind. The (normalized) extension is
# used directly as the MIME subtype.
_SUPPORTED_MEDIA_EXTENSIONS = {
    "image": ["webp", "jpeg", "png", "jpg"],
    "video": [
        "mp4",
        "mpeg",
        "x-flv",
        "quicktime",
        "mpegps",
        "mpg",
        "webm",
        "wmv",
        "3gpp",
    ],
    "audio": [
        "aac",
        "flac",
        "mp3",
        "m4a",
        "mpeg",
        "mpga",
        "mp4",
        "opus",
        "pcm",
        "wav",
        "webm",
    ],
}


def _media_mime_type(uri: str, media_kind: str) -> str:
    """Derive a ``<media_kind>/<subtype>`` MIME type from a URI's extension.

    Args:
        uri: Location of the media file. The text after the last "." is
            treated as the file extension (compared case-insensitively).
        media_kind: One of the keys of ``_SUPPORTED_MEDIA_EXTENSIONS``
            ("image", "video" or "audio").

    Returns:
        The MIME type string, e.g. ``"image/jpeg"`` for a ``.jpg`` URI.

    Raises:
        ValueError: If the extension is not supported for ``media_kind``.
    """
    extension = uri.rsplit(".", 1)[-1].lower()
    if extension not in _SUPPORTED_MEDIA_EXTENSIONS[media_kind]:
        raise ValueError(f"Unsupported {media_kind} file type: {extension}")
    # "jpg" is only a file-extension spelling; the MIME subtype is "jpeg".
    if extension == "jpg":
        extension = "jpeg"
    return f"{media_kind}/{extension}"


def _user_message(text: str, file_uri: str = None, mime_type: str = None) -> dict:
    """Build a user-role message dict with a text part and an optional file part."""
    parts = [{"text": text}]
    if file_uri is not None:
        parts.append({"file_data": {"file_uri": file_uri, "mime_type": mime_type}})
    return {"role": "user", "parts": parts}


async def test_remote_trip_planner():
    """Run a multi-turn, multimodal conversation against the deployed agent.

    Steps:
      1. Create a session on the deployed agent and send a greeting followed
         by an image, a video and an audio message.
      2. Fetch the session with ``VertexAiSessionService`` and add it to
         Memory Bank with ``VertexAiMemoryBankService`` from the client side.
      3. Create a second session and send a greeting plus a request that
         refers to the media shared in the first session.

    Raises:
        ValueError: If a media URI has an unsupported file extension.
    """

    async def send(message: dict, target_session_id: str, pause_seconds: int = 0):
        """Send one message to the deployed agent, then optionally pause."""
        await call_remote_agent(
            agent=remote_mm_adk_agent,
            user_id=USER_ID,
            session_id=target_session_id,
            message=message,
        )
        if pause_seconds:
            await asyncio.sleep(pause_seconds)

    conversation_session = await remote_mm_adk_agent.async_create_session(
        user_id=USER_ID
    )
    session_id = conversation_session["id"]
    print(f"🌍 Starting a new trip planning session: {session_id}\n")

    image_uri = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/landmark1.jpg"
    video_uri = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4"
    audio_uri = "gs://github-repo/audio_ai/gaeta.wav"

    text_message = _user_message("Hello!")
    await send(text_message, session_id, pause_seconds=5)

    image_message = _user_message(
        "I'm planning a trip. First, here is a picture that shows you the kind of place I like.",
        image_uri,
        _media_mime_type(image_uri, "image"),
    )
    await send(image_message, session_id, pause_seconds=5)

    video_message = _user_message(
        "Next, here's a video. I also enjoy cities close to the Mediterranean sea.",
        video_uri,
        _media_mime_type(video_uri, "video"),
    )
    await send(video_message, session_id, pause_seconds=5)

    audio_message = _user_message(
        "Finally, I loved Gaeta. To give you an idea, here is an audio note about it.",
        audio_uri,
        _media_mime_type(audio_uri, "audio"),
    )
    await send(audio_message, session_id)

    print("\n---------------------------------------------------")
    # The session is added to Memory Bank explicitly just below, so the status
    # message describes that step instead of claiming it happens automatically.
    print(
        "Conversation finished. Consolidating the session into Memory Bank from the client side..."
    )

    # The last segment of the full resource name identifies the agent engine;
    # both services below are keyed on it.
    agent_engine_id = remote_mm_adk_agent_name.split("/")[-1]

    vertex_session_service = VertexAiSessionService(PROJECT_ID, LOCATION)
    final_session_state = await vertex_session_service.get_session(
        app_name=agent_engine_id,
        user_id=USER_ID,
        session_id=session_id,
    )

    vertex_memory_bank_service = VertexAiMemoryBankService(
        agent_engine_id=agent_engine_id,
        project=PROJECT_ID,
        location=LOCATION,
    )
    await vertex_memory_bank_service.add_session_to_memory(final_session_state)
    print("✅ Full conversation context is now persistent.")
    print("---------------------------------------------------\n")
    # Pause before querying from a new session.
    # NOTE(review): presumably gives memory generation time to finish; confirm.
    await asyncio.sleep(10)

    verification_session = await remote_mm_adk_agent.async_create_session(
        user_id=USER_ID
    )
    new_session_id = verification_session["id"]
    print(
        f"🌅 Starting a NEW session ({new_session_id}) to test cumulative memory...\n"
    )

    hello_again_message = _user_message("Hello again!")
    await send(hello_again_message, new_session_id, pause_seconds=5)

    verification_message = _user_message(
        "Based on the picture, video, AND audio I shared with you before, suggest a cultural destination for me."
    )
    await send(verification_message, new_session_id)

In [None]:
# Run the remote test: the same multimodal conversation as the local test,
# but sent to the deployed agent, followed by a new session that checks recall.
await test_remote_trip_planner()

## Cleaning up

Finally, let's clean up our resources to avoid incurring costs.

In [None]:
# Set to False to keep the deployed agent engine(s) after running this cell.
delete_agent_engine = True

if delete_agent_engine:
    # Use a dedicated name for the listing. Rebinding `agent_engines` here
    # would shadow the `agent_engines` object that an earlier cell calls
    # (`agent_engines.get(...)`), so that cell would fail if re-run afterwards.
    deployed_agent_engines = client.agent_engines.list()
    for agent_engine in deployed_agent_engines:
        agent_engine_name = agent_engine.api_resource.name
        # Matching is by display name, so every listed engine named
        # "Multimodal Memory Bank" is deleted, not only the latest deployment.
        if agent_engine.api_resource.display_name == "Multimodal Memory Bank":
            # NOTE(review): `force=True` presumably also removes child
            # resources such as sessions and memories; confirm in the SDK docs.
            agent_engine.delete(force=True)
            print(f"Deleted agent engine: {agent_engine_name}")