In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Intro to Computer Use with Gemini

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/computer-use/intro_computer_use.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fcomputer-use%2Fintro_computer_use.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/computer-use/intro_computer_use.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/computer-use/intro_computer_use.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<p>
<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/computer-use/intro_computer_use.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/computer-use/intro_computer_use.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/computer-use/intro_computer_use.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/computer-use/intro_computer_use.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/computer-use/intro_computer_use.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>
</p>

| Authors |
| --- |
| [Eric Dong](https://github.com/gericdong) |

## Overview

The **Computer Use** model lets you create agents that can automate tasks on a computer. It works by "seeing" the screen with screenshots and then "acting" with mouse clicks and keyboard inputs.

This is useful for tasks like:

- Automatically filling out forms on websites.
- Testing web applications.
- Researching information, like comparing prices, across different sites.


## Objective

In this tutorial, you will build a simple web automation agent using the Gemini Computer Use model. By the end, you will understand the complete workflow: from sending an initial prompt with a screenshot to executing browser actions and looping until a task is complete.

## Getting Started

### Install the Gen AI SDK and required libraries

In [None]:
# Install or upgrade the Gen AI SDK and Playwright; --quiet trims pip's output.
%pip install --upgrade --quiet google-genai playwright

> ⚠️ Note: You can ignore pip's dependency errors.

### Set up Playwright

Playwright is a tool for browser automation. It lets you control web browsers such as Chromium, Firefox, and WebKit from code.


In [None]:
%%capture

# Download the browser binaries that Playwright drives
# (the %%capture magic above hides the installers' output)
!playwright install

# Install the system libraries those browsers depend on (only required on Linux)
!playwright install-deps

### Authenticate your notebook environment

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The check is on sys.modules, so the body only runs when google.colab has
# already been loaded (i.e. inside Colab); elsewhere this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticates the current Colab user for this runtime.
    auth.authenticate_user()

### Import libraries


In [None]:
import asyncio
import os
import time
from types import SimpleNamespace
from typing import List, Tuple

from google import genai
from google.genai.types import (
    ComputerUse,
    Content,
    Environment,
    FunctionCall,
    FunctionResponse,
    FunctionResponseBlob,
    FunctionResponsePart,
    GenerateContentConfig,
    Part,
    Tool,
)
from playwright.async_api import async_playwright

### Set your project information

Update the following variables with your Google Cloud project details, and connect to the Gen AI service on Vertex AI.

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. Default to "" so that a missing variable
    # is not turned into the literal string "None" and sent as a project ID.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Fail here with an actionable message instead of a confusing API error later
    raise ValueError(
        "No Google Cloud project configured: set PROJECT_ID in this cell or "
        "define the GOOGLE_CLOUD_PROJECT environment variable."
    )

LOCATION = "global"  # @param {type: "string"}

# Connect to the Gen AI service on Vertex AI.
# `client` is used by every model call later in the notebook.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Supported Models

This tutorial uses the `gemini-2.5-computer-use-preview-10-2025` model.

In [None]:
# fmt: off
# Model identifier passed to the generate_content calls in this notebook.
MODEL_ID = "gemini-2.5-computer-use-preview-10-2025"  # @param {type: "string"}
# fmt: on

## Computer Use: Agent Loop

To build a browser control agent, you implement an "agent loop" that continuously cycles through four key steps. This process allows the agent to perform a sequence of actions to achieve a goal.

1. **Send Request to the Model**. Your app sends the goal (e.g., "Find me a flight") and a current screenshot of the screen to the model.

2. **Receive the Model Response**. The model analyzes the screen and sends back a suggested action, such as navigating to a URL. It may also include a safety warning for risky actions.

3. **Execute the Received Action**. Your code runs the suggested action. If there's a safety warning, your code must ask the user for confirmation before proceeding.


4. **Capture the New Environment State**. After the action, your code takes a new screenshot. This new screenshot is sent back to the model in the next turn, starting the cycle over again.

## Prerequisites: Setting Up Your Environment

Before you begin, you need to set up two key components:

- **Secure Execution Environment**: For safety, you must run your Computer Use agent in a secure and controlled environment. Good options include a sandboxed virtual machine, a container, or a dedicated browser profile with limited permissions.

- **Client-Side Action Handler**: You need to write client-side logic to execute the actions generated by the model (e.g., clicking a button) and capture screenshots.

In this tutorial, we use Playwright to start a browser environment for demonstration purposes.

In [None]:
# Start the Playwright session (the notebook allows `await` at cell level)
playwright = await async_playwright().start()

# Launch the browser in headless mode, which is required for this environment
browser = await playwright.chromium.launch(headless=True)

# Create a new page and fix its viewport size. The same width and height are
# passed to execute_function_calls later, which uses them to convert the
# model's 0-1000 coordinates into pixels.
page = await browser.new_page()
screen_width, screen_height = 1920, 1080
await page.set_viewport_size({"width": screen_width, "height": screen_height})

print("Playwright session started.")

## A Single Turn: Step-by-Step Walkthrough

Now, let's walk through the code for a single turn of the agent loop, from sending the first request to preparing for the next one.

### **1. Send a Request to the Model**
First, you configure your API request. In the request, you add the Computer Use tool and send a prompt that includes the user's goal and an initial screenshot.

You can also include optional parameters like `excluded_predefined_functions` to prevent the model from using certain actions.

In [None]:
# Configure the Computer Use tool for a browser environment.
# `config` is reused by agent_loop later in the notebook.
config = GenerateContentConfig(
    tools=[
        Tool(
            computer_use=ComputerUse(
                environment=Environment.ENVIRONMENT_BROWSER,
                # Optional: actions listed here are excluded from what the
                # model may request
                excluded_predefined_functions=["drag_and_drop"],
            )
        )
    ],
)

# Capture the page as it is right now to give the model a starting point
screenshot_bytes = await page.screenshot()

# `contents` is the conversation history; a later cell appends to it
contents = [
    Content(
        role="user",
        parts=[
            # The user's goal in natural language
            Part(
                text="Find me a flight from SF to Hawaii on next Monday, coming back on next Friday. Start by navigating directly to flights.google.com"
            ),
            # Optional: include a screenshot of the initial state
            Part.from_bytes(
                data=screenshot_bytes,
                mime_type="image/png",
            ),
        ],
    )
]

# Send the goal and screenshot; the model answers with suggested UI action(s)
response = client.models.generate_content(
    model=MODEL_ID,
    contents=contents,
    config=config,
)

# Raw dump of the whole response; the next cell shows just the function calls
print(response)

### **2. Receive the Model Response**
The model responds with one or more `FunctionCalls` that represent the UI actions it wants to perform. Let's inspect the response from our first API call.

In [None]:
# Bare expression so the notebook displays the function calls in the response
response.function_calls

### **3. Execute the Received Actions**

Next, our application's client-side code needs to parse the response and execute the requested actions using Playwright. We'll use the `execute_function_calls` helper function for this.

The following example implements some of the most common UI actions. For a production use case, you would need to implement all supported actions unless you explicitly exclude them.

In [None]:
def normalize_x(x: int, screen_width: int) -> int:
    """Scale an x coordinate on the model's 0-1000 grid to a pixel column.

    The result is truncated to an integer.
    """
    fraction_of_width = x / 1000
    return int(fraction_of_width * screen_width)


def normalize_y(y: int, screen_height: int) -> int:
    """Scale a y coordinate on the model's 0-1000 grid to a pixel row.

    The result is truncated to an integer.
    """
    relative_position = y / 1000
    pixel_row = relative_position * screen_height
    return int(pixel_row)


# Client-side action handler: maps the model's function calls to Playwright calls.


async def execute_function_calls(
    response, page, screen_width, screen_height
) -> Tuple[str, List[Tuple[str, str]]]:
    """Run the UI actions requested in a model response on a Playwright page.

    Only ``open_web_browser``, ``navigate``, ``click_at`` and ``type_text_at``
    are implemented; any other action is reported as ``"unknown_function"``.

    Args:
        response: Model response. Only ``response.candidates[0].content.parts``
            is read. A part with a ``function_call`` is an action to execute;
            a part flagged ``thought`` that has ``text`` is printed as
            reasoning.
        page: Playwright page (async API) the actions are performed on.
        screen_width: Viewport width in pixels, used to scale x coordinates.
        screen_height: Viewport height in pixels, used to scale y coordinates.

    Returns:
        A ``(status, results)`` tuple. ``status`` is ``"NO_ACTION"`` (with an
        empty ``results`` list) when the response contains no function calls,
        otherwise ``"CONTINUE"``. ``results`` holds one ``(name, outcome)``
        pair per function call, in order, where ``outcome`` is ``"success"``,
        ``"unknown_function"`` or ``"error: <message>"``. Failures of
        individual actions are caught and reported this way, not raised.
    """
    # Tolerate responses without candidates, content or parts instead of
    # failing with IndexError / TypeError.
    candidates = getattr(response, "candidates", None) or []
    content = getattr(candidates[0], "content", None) if candidates else None
    parts = getattr(content, "parts", None) or []

    function_calls = []
    thoughts = []
    for part in parts:
        if getattr(part, "function_call", None):
            function_calls.append(part.function_call)
        elif getattr(part, "thought", None) and getattr(part, "text", None):
            # Keep only thoughts that carry text so the join below never
            # receives None.
            thoughts.append(part.text)

    if thoughts:
        print(f" Model Reasoning: {' '.join(thoughts)}")

    if not function_calls:
        return "NO_ACTION", []

    results = []
    for function_call in function_calls:
        result = None
        print(f"⚡ Executing Action: {function_call.name}")
        try:
            if function_call.name == "open_web_browser":
                # The caller already supplies an open page; nothing to do
                result = "success"
            elif function_call.name == "navigate":
                await page.goto(function_call.args["url"])
                result = "success"
            elif function_call.name == "click_at":
                actual_x = normalize_x(function_call.args["x"], screen_width)
                actual_y = normalize_y(function_call.args["y"], screen_height)
                await page.mouse.click(actual_x, actual_y)
                result = "success"
            elif function_call.name == "type_text_at":
                actual_x = normalize_x(function_call.args["x"], screen_width)
                actual_y = normalize_y(function_call.args["y"], screen_height)
                # Click first so the target element has focus before typing
                await page.mouse.click(actual_x, actual_y)
                # Non-blocking pause: time.sleep() would stall the event loop
                # that Playwright's async API runs on.
                await asyncio.sleep(0.1)
                await page.keyboard.type(function_call.args["text"])
                if function_call.args.get("press_enter", False):
                    await page.keyboard.press("Enter")
                result = "success"
            else:
                result = "unknown_function"
        except Exception as e:
            # Best effort: record the failure and keep going so that every
            # function call still gets a result entry.
            print(f"❗️ Error executing {function_call.name}: {e}")
            result = f"error: {str(e)}"
        results.append((function_call.name, result))
    return "CONTINUE", results

In [None]:
# Execute the actions from the real model response.
# `results` is a (status, [(action_name, outcome), ...]) tuple.
results = await execute_function_calls(response, page, screen_width, screen_height)
print(results)

Here is an example action for navigating to a URL. In this case, we create a simple mock response object.


In [None]:
# Stand-in for a model response. It exposes only the attributes that
# execute_function_calls reads: candidates[0].content.parts and, on each part,
# `function_call` and `thought`.
mock_response = SimpleNamespace(
    candidates=[
        SimpleNamespace(
            content=SimpleNamespace(
                parts=[
                    SimpleNamespace(
                        # A single "navigate" action with its target URL
                        function_call=FunctionCall(
                            name="navigate", args={"url": "https://flights.google.com"}
                        ),
                        thought=None,
                    )
                ]
            )
        )
    ]
)

In [None]:
# Show where the page is before the mocked action runs
print(f"Current page URL: {page.url}")

print("Calling execute_function_calls with a sample response")
# Overwrites `results` from the earlier cell; the Step 4 cell consumes this value
results = await execute_function_calls(mock_response, page, screen_width, screen_height)
print(f"Results from execution:\n{results}\n")

# The URL should now reflect the navigate action
print(f"Navigated to: {page.url}")

### **4. Capture the New State and Respond**

Finally, after executing the actions, we capture a new screenshot and the current URL. This state information is then formatted as a `FunctionResponse` and added to our conversation history, making it ready for the next turn in the loop.

In [None]:
status, action_results_list = results

function_response_parts = []
# for name, result in results:
for name, result in action_results_list:
    # After each action, capture a new screenshot and the current URL
    screenshot_bytes = await page.screenshot()
    current_url = page.url

    # Create a FunctionResponse for each action that was executed
    # This is required even if multiple actions were called in parallel
    function_response_parts.append(
        FunctionResponse(
            name=name,
            response={"url": current_url},
            parts=[
                Part(
                    inline_data=FunctionResponseBlob(
                        mime_type="image/png", data=screenshot_bytes
                    )
                )
            ],
        )
    )

# Package all the function responses into a single 'user' message
user_feedback_content = Content(role="user", parts=function_response_parts)

# Append this new message to your conversation history
contents.append(user_feedback_content)

print("Step 4 Complete: New state captured and added to conversation history.")

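The `contents` list now holds the new state. In a full loop, the model's own response (the turn containing its function calls) must also be appended to `contents` before the function responses, as the agent loop below does.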

#### Clean up by closing the browser and stopping the session


In [None]:
# Close the browser used for the walkthrough, then stop the Playwright session.
# agent_loop below starts its own session, so nothing from here is reused.
await browser.close()
await playwright.stop()

print("Browser closed and Playwright session stopped.")

## Build an Agent Loop

To enable multi-step interactions, combine the four steps from the "A Single Turn: Step-by-Step Walkthrough" section above into a loop. The loop must handle parallel function calls, thought summaries, and safety decisions. Remember to manage the conversation history correctly by appending both model responses and your function responses. Note that the simplified loop below does not implement safety-decision handling; add it before using this pattern beyond a demo.

In [None]:
async def agent_loop(
    initial_prompt, max_turns=10, screen_width=1920, screen_height=1080
):
    """Drive a headless Chromium page with the Computer Use model.

    Each turn sends the conversation so far to the model, executes the
    function calls it returns via ``execute_function_calls``, and feeds a
    fresh screenshot plus the current URL back as function responses.

    The loop stops when the model returns no candidates, when it answers
    without requesting any further action, or after ``max_turns`` model calls.
    The browser and the Playwright session are always shut down, even if a
    turn raises.

    Relies on the notebook-level ``client``, ``MODEL_ID`` and ``config``.
    Safety decisions attached to function calls are not handled here.

    Args:
        initial_prompt: Natural-language goal for the agent.
        max_turns: Upper bound on the number of model calls.
        screen_width: Browser viewport width in pixels.
        screen_height: Browser viewport height in pixels.
    """
    playwright_loop = await async_playwright().start()
    browser_loop = None
    try:
        browser_loop = await playwright_loop.chromium.launch(headless=True)
        page_loop = await browser_loop.new_page()
        await page_loop.set_viewport_size(
            {"width": screen_width, "height": screen_height}
        )

        print(f"Starting Agent Loop with prompt: '{initial_prompt}'")

        screenshot = await page_loop.screenshot()
        contents = [
            Content(
                role="user",
                parts=[
                    Part(text=initial_prompt),
                    Part.from_bytes(data=screenshot, mime_type="image/png"),
                ],
            )
        ]

        for turn in range(max_turns):
            print(f"\n Turn {turn + 1}")

            response = client.models.generate_content(
                model=MODEL_ID, contents=contents, config=config
            )

            # Handle cases where the model returns no candidates (e.g., due to safety filters)
            if not response.candidates:
                print(
                    "Model returned no candidates. This may be due to a safety filter."
                )
                print("Full Response:", response)
                print("Terminating loop.")
                break

            model_content = response.candidates[0].content
            model_parts = (model_content.parts if model_content else None) or []
            if model_content is not None:
                # The model turn must be in the history before its function
                # responses are appended.
                contents.append(model_content)

            # The task is finished when the model stops requesting actions.
            # Text that accompanies a function call is commentary, not the
            # final answer, so it must not end the loop.
            wants_action = any(
                getattr(part, "function_call", None) for part in model_parts
            )
            if not wants_action:
                final_text = "".join(
                    part.text
                    for part in model_parts
                    if getattr(part, "text", None)
                    and not getattr(part, "thought", None)
                )
                print(f"Agent Finished: {final_text}")
                break

            status, execution_results = await execute_function_calls(
                response, page_loop, screen_width, screen_height
            )

            if status == "NO_ACTION":
                # Defensive: nothing was executed, so there is no new state to
                # report and another identical request would not help.
                break

            # All actions of this turn have already run, so one screenshot and
            # URL describe the state that every function response reports.
            screenshot = await page_loop.screenshot()
            current_url = page_loop.url

            function_response_parts = []
            for name, result in execution_results:
                response_payload = {"url": current_url}
                if result != "success":
                    # Report failures so the model can react to them
                    response_payload["error"] = result
                # Content.parts holds Part objects, so wrap each
                # FunctionResponse; the screenshot is a FunctionResponsePart.
                function_response_parts.append(
                    Part(
                        function_response=FunctionResponse(
                            name=name,
                            response=response_payload,
                            parts=[
                                FunctionResponsePart(
                                    inline_data=FunctionResponseBlob(
                                        mime_type="image/png", data=screenshot
                                    )
                                )
                            ],
                        )
                    )
                )
            contents.append(Content(role="user", parts=function_response_parts))
            print(f"State captured. History now has {len(contents)} messages.")
    finally:
        # Always release the browser and the Playwright driver
        print("\n Agent loop finished. Closing browser.")
        if browser_loop is not None:
            await browser_loop.close()
        await playwright_loop.stop()

In [None]:
# RUN THE AGENT LOOP
prompt = "Navigate to the Google Store and find the 'Pixel' category."
await agent_loop(prompt)