In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini 2.0 Flash Image Generation in Vertex AI with REST API

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fgetting-started%2Fintro_gemini_2_0_image_gen_rest_api.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_image_gen_rest_api.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Nikita Namjoshi](https://github.com/nikitamaia) |
| [Katie Nguyen](https://github.com/katiemn) |

## Overview

Gemini 2.0 Flash supports image generation and editing. This enables you to converse with Gemini and create images with interwoven text.

In this tutorial, you'll learn how to use Gemini 2.0 Flash's image generation features in Vertex AI using the REST API.

You'll try out the following scenarios:
* Image generation:
  * Text to image
  * Text to image and text (interleaved)
* Image editing:
  * Text and image to image
  * Multi-turn image editing
  * Images and text to image and text (interleaved)


## Get started

### Install required libraries

In [1]:
%%capture

!sudo apt install -q jq

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment.

In [2]:
import sys

# Authenticate only when the google.colab package is already loaded in this
# runtime (i.e. the notebook is running on Colab); elsewhere this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Import libraries

In [42]:
import base64
import json

from IPython.display import Image, Markdown, display

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [3]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Default to "" instead of wrapping the lookup in str(): str(None) is the
    # literal string "None", which would be embedded in the request URL as a
    # bogus project ID without any warning.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Make the misconfiguration visible here rather than in a failed API call.
    print(
        "WARNING: PROJECT_ID is not set. Enter your project ID above or set the "
        "GOOGLE_CLOUD_PROJECT environment variable before sending any request."
    )

# Location segment interpolated into the request URL.
LOCATION = "global"

### Load the image model

Gemini 2.0 Flash image generation: `gemini-2.0-flash-preview-image-generation`

In [4]:
# Model identifier; it is interpolated into the ".../publishers/google/models/{MODEL_ID}"
# segment of the request URL built for the cURL commands.
MODEL_ID = "gemini-2.0-flash-preview-image-generation"

### Defining environment variables for cURL commands

These environment variables are used to construct the cURL commands.

In [5]:
import os

# Export the settings so the %%bash cells can expand them. The curl commands
# in this notebook read only API_ENDPOINT; PROJECT_ID and LOCATION are exported
# alongside it.
os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["LOCATION"] = LOCATION

# The "global" location is served from the un-prefixed host, while a regional
# location is served from "<region>-aiplatform.googleapis.com". Deriving the
# host from LOCATION keeps the endpoint valid if LOCATION is changed.
API_HOST = (
    "aiplatform.googleapis.com"
    if LOCATION == "global"
    else f"{LOCATION}-aiplatform.googleapis.com"
)

# Scheme-less model resource path: the curl cells prepend "https://" and append
# the ":generateContent" method name.
os.environ["API_ENDPOINT"] = (
    f"{API_HOST}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{MODEL_ID}"
)

## Image generation

First, send a text prompt to Gemini 2.0 Flash describing the image you want to generate.


### Text to image

In the curl command below, you'll see that the payload includes the following keys:

* `contents`: this is your prompt, in this case a text only user message
* `generation_config`: this dictionary specifies the desired output modalities, in this case `TEXT` and `IMAGE`. If you do not specify `IMAGE`, you will not get image output; specifying `IMAGE` alone is not allowed.
* `safetySettings`: select your options from the categories below:
    * `method`: HARM_BLOCK_METHOD_UNSPECIFIED, SEVERITY, PROBABILITY
    * `category`: HARM_CATEGORY_UNSPECIFIED, HARM_CATEGORY_HATE_SPEECH, HARM_CATEGORY_DANGEROUS_CONTENT, HARM_CATEGORY_HARASSMENT, HARM_CATEGORY_SEXUALLY_EXPLICIT, HARM_CATEGORY_CIVIC_INTEGRITY
    * `threshold`: HARM_BLOCK_THRESHOLD_UNSPECIFIED, BLOCK_LOW_AND_ABOVE, BLOCK_MEDIUM_AND_ABOVE, BLOCK_ONLY_HIGH, BLOCK_NONE, OFF

The cell below writes the output of running the curl command to the file `response.json`.

In [38]:
%%bash

# Text-to-image request: POST a text-only user prompt to the generateContent
# method of the model and save the JSON response body to response.json.
# -sS hides the progress meter but still reports curl errors on stderr.
# The request body is strict JSON (no trailing commas).

curl -sS -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  "https://${API_ENDPOINT}:generateContent" \
  -d '{
    "contents": {
      "role": "USER",
      "parts": { "text": "generate an image of a penguin driving a taxi in New York City"}
    },
    "generation_config": {
      "response_modalities": ["TEXT", "IMAGE"]
    },
    "safetySettings": {
      "method": "PROBABILITY",
      "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
      "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    }
  }' >response.json

Let's examine the output in the `response.json` file.

In `content` you can see that the model has created an `image/png` part; its base64-encoded bytes are the value of the `data` key.

In [39]:
# Print the raw response body that the curl cell saved. The output can be very
# long because the generated image is embedded as a base64 string.
!cat response.json

Next, load in the data from the `response.json` file so it's easier to work with in Python.

In [40]:
# Parse the saved response body into a Python object, then echo it.
with open("response.json") as response_file:
    response_data = json.loads(response_file.read())

print(response_data)

Extract the image data from the response and visualize. All generated images include a [SynthID watermark](https://deepmind.google/technologies/synthid/), which can be verified via the Media Studio in [Vertex AI Studio](https://cloud.google.com/generative-ai-studio?hl=en).

In [41]:
# Pick the first response part that carries inline image data. The default of
# None avoids a bare StopIteration when no such part exists.
image_part = next(
    (
        part
        for part in response_data["candidates"][0]["content"]["parts"]
        if "inlineData" in part
    ),
    None,
)
if image_part is None:
    raise ValueError(
        "The response contains no image part; inspect response.json for what was returned."
    )

# The image bytes arrive base64-encoded under inlineData.data.
image_data = base64.b64decode(image_part["inlineData"]["data"])
display(Image(data=image_data, width=350, height=350))

### Text to image and text

In addition to generating images, Gemini can generate multiple images and text in an interleaved fashion.

For example, you could ask the model to generate a recipe for banana bread with images showing different stages of the cooking process. Or, you could ask the model to generate images of different wildflowers with accompanying titles and descriptions.

Let's try out the interleaved text and image functionality by prompting Gemini 2.0 Flash to create a tutorial for assembling a peanut butter and jelly sandwich.

You'll notice that in the prompt we ask the model to generate both text and images. This will nudge the model to create text with images interleaved.

⚠️ **Note:** we are asking the model to generate a lot of content in this prompt, so it will take a bit of time for this cell to finish executing.

In [22]:
%%bash

# Interleaved text-and-image request: ask for a three-step tutorial with one
# image per step and save the JSON response body to response.json.
# -sS hides the progress meter but still reports curl errors on stderr.
# The request body is strict JSON (no trailing commas).

curl -sS -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  "https://${API_ENDPOINT}:generateContent" \
  -d '{
    "contents": {
      "role": "USER",
      "parts": { "text": "Create a tutorial explaining how to make a peanut butter and jelly sandwich in three easy steps. For each step, provide a title with the number of the step, an explanation, and also generate an image, generate each image in a 1:1 aspect ratio."}
    },
    "generation_config": {
      "response_modalities": ["TEXT", "IMAGE"]
    },
    "safetySettings": {
      "method": "PROBABILITY",
      "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
      "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    }
  }' >response.json

Let's visualize the response.

In [None]:
# JSON text is UTF-8; do not depend on the platform default encoding.
with open("response.json", encoding="utf-8") as f:
    response_data = json.load(f)

# A failed request is saved as an {"error": ...} body without "candidates";
# report it directly instead of failing below with an opaque KeyError.
if "error" in response_data:
    raise RuntimeError(f"generateContent request failed: {response_data['error']}")

# Render the parts in the order returned so text and images stay interleaved.
for part in response_data["candidates"][0]["content"]["parts"]:
    if "text" in part:
        display(Markdown(part["text"]))
    if "inlineData" in part:
        # Image bytes arrive base64-encoded under inlineData.data.
        content = part["inlineData"]["data"]
        image_data = base64.b64decode(content)
        display(Image(data=image_data, width=350, height=350))

## Image editing

You can pass text and an image to Gemini 2.0 Flash for use cases like product captions, information about a particular image, or to make edits or modifications to an existing image.

### Text and image to image

Let's try out a style transfer example and ask Gemini 2.0 Flash to create an image of this dog in a 3D cartoon rendering.

Visualize the starting dog image by running this next cell.

In [None]:
# HTTPS address of the sample dog photo; the request cell that follows refers
# to the same bucket path with a gs:// URI.
image_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/dog-1.jpg"

# Build the preview first, then hand it to the notebook for rendering.
dog_preview = Image(url=image_url, width=350, height=350)
display(dog_preview)

In [53]:
%%bash

# Text-and-image-to-image request: send the dog photo (by Cloud Storage URI)
# plus a style instruction and save the JSON response body to response.json.
# -sS hides the progress meter but still reports curl errors on stderr.
# The request body is strict JSON (no trailing commas), and the photo is
# declared with the registered JPEG MIME type, image/jpeg.

curl -sS -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  "https://${API_ENDPOINT}:generateContent" \
  -d '{
    "contents": {
      "role": "USER",
      "parts": [
        {"file_data": {
          "mime_type": "image/jpeg",
          "file_uri": "gs://cloud-samples-data/generative-ai/image/dog-1.jpg"
          }
        },
        {"text": "Create a 3D cartoon style portrait of this dog, include rounded, exaggerated facial features, saturated colors, and realistic-looking textures. The dog is wearing a cowboy hat."}
      ]
    },
    "generation_config": {
      "response_modalities": ["TEXT", "IMAGE"]
    },
    "safetySettings": {
      "method": "PROBABILITY",
      "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
      "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    }
  }' >response.json

Extract the image data from the response and visualize.

In [None]:
# JSON text is UTF-8; do not depend on the platform default encoding.
with open("response.json", encoding="utf-8") as f:
    response_data = json.load(f)

# A failed request is saved as an {"error": ...} body without "candidates";
# report it directly instead of failing below with an opaque KeyError.
if "error" in response_data:
    raise RuntimeError(f"generateContent request failed: {response_data['error']}")

# Pick the first response part that carries inline image data. The default of
# None avoids a bare StopIteration when no such part exists.
image_part = next(
    (
        part
        for part in response_data["candidates"][0]["content"]["parts"]
        if "inlineData" in part
    ),
    None,
)
if image_part is None:
    raise ValueError(
        "The response contains no image part; inspect response.json for what was returned."
    )

# The image bytes arrive base64-encoded under inlineData.data.
image_data = base64.b64decode(image_part["inlineData"]["data"])
display(Image(data=image_data, width=350, height=350))

### Multi-turn image editing

In this next section, you supply a starting image and iteratively alter certain aspects of the image through chatting with Gemini 2.0 Flash.

Visualize the starting image of a vase that's stored in Google Cloud Storage by running this next cell.

In [None]:
# HTTPS address of the sample vase image; the request cell that follows refers
# to the same bucket path with a gs:// URI.
image_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/vase.png"

# Build the preview first, then hand it to the notebook for rendering.
vase_preview = Image(url=image_url, width=350, height=350)
display(vase_preview)

In [56]:
%%bash

# First editing turn: send the vase image (by Cloud Storage URI) with an edit
# instruction and save the JSON response body to response.json.
# -sS hides the progress meter but still reports curl errors on stderr.
# The request body is strict JSON (no trailing commas).

curl -sS -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  "https://${API_ENDPOINT}:generateContent" \
  -d '{
    "contents": {
      "role": "USER",
      "parts": [
        {"file_data": {
          "mime_type": "image/png",
          "file_uri": "gs://cloud-samples-data/generative-ai/image/vase.png"
          }
        },
        {"text": "add sunflowers to this vase"}
      ]
    },
    "generation_config": {
      "response_modalities": ["TEXT", "IMAGE"]
    }
  }' >response.json

Extract the image data from the response and visualize.


In [None]:
# JSON text is UTF-8; do not depend on the platform default encoding.
with open("response.json", encoding="utf-8") as f:
    response_data = json.load(f)

# A failed request is saved as an {"error": ...} body without "candidates";
# report it directly instead of failing below with an opaque KeyError.
if "error" in response_data:
    raise RuntimeError(f"generateContent request failed: {response_data['error']}")

# Pick the first response part that carries inline image data. The default of
# None avoids a bare StopIteration when no such part exists.
image_part = next(
    (
        part
        for part in response_data["candidates"][0]["content"]["parts"]
        if "inlineData" in part
    ),
    None,
)
if image_part is None:
    raise ValueError(
        "The response contains no image part; inspect response.json for what was returned."
    )

# The image bytes arrive base64-encoded under inlineData.data.
image_data = base64.b64decode(image_part["inlineData"]["data"])
display(Image(data=image_data, width=350, height=350))

Now, you'll add to the `contents` of the last request by including another `user` text prompt.

In [58]:
%%bash

# Second editing turn: resend the first user message (vase image + instruction)
# followed by a new user message, and save the JSON response body to
# response.json.
# NOTE(review): the image generated in the previous turn is not part of this
# request; the model receives only the original vase image and both
# instructions.
# -sS hides the progress meter but still reports curl errors on stderr.
# The request body is strict JSON (no trailing commas).

curl -sS -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  "https://${API_ENDPOINT}:generateContent" \
  -d '{
    "contents": [
      {
        "role": "user",
        "parts": [
          {"file_data": {
            "mime_type": "image/png",
            "file_uri": "gs://cloud-samples-data/generative-ai/image/vase.png"
            }
          },
          {"text": "add sunflowers to this vase"}
        ]
      },
      {
        "role": "user",
        "parts": [
          { "text": "replace the sunflowers in the vase with pink and purple tulips" }
        ]
      }
    ],
    "generation_config": {
      "response_modalities": ["TEXT", "IMAGE"]
    }
  }' >response.json

Extract the image data from the response and visualize.


In [None]:
# JSON text is UTF-8; do not depend on the platform default encoding.
with open("response.json", encoding="utf-8") as f:
    response_data = json.load(f)

# A failed request is saved as an {"error": ...} body without "candidates";
# report it directly instead of failing below with an opaque KeyError.
if "error" in response_data:
    raise RuntimeError(f"generateContent request failed: {response_data['error']}")

# Pick the first response part that carries inline image data. The default of
# None avoids a bare StopIteration when no such part exists.
image_part = next(
    (
        part
        for part in response_data["candidates"][0]["content"]["parts"]
        if "inlineData" in part
    ),
    None,
)
if image_part is None:
    raise ValueError(
        "The response contains no image part; inspect response.json for what was returned."
    )

# The image bytes arrive base64-encoded under inlineData.data.
image_data = base64.b64decode(image_part["inlineData"]["data"])
display(Image(data=image_data, width=350, height=350))

### Images and text to image and text

When editing images with Gemini 2.0 Flash, you can also supply multiple input images to create new ones. In this next example, you'll prompt Gemini with two images: one of a teacup and one of an outdoor table. You'll then ask Gemini to combine the objects from these images in order to create a new one. You'll also ask Gemini to supply text to accompany the image.

Visualize the starting images by running this next cell.

In [None]:
# HTTPS addresses of the two input images, shown in the order table, teacup.
table_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/table.png"
teacup_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/teacup-1.png"

# Both previews use the same 300x300 display size.
for source_url in (table_url, teacup_url):
    display(Image(url=source_url, width=300, height=300))

In [61]:
%%bash

# Multi-image request: send a text instruction plus the table and teacup
# images (by Cloud Storage URI) and save the JSON response body to
# response.json.
# -sS hides the progress meter but still reports curl errors on stderr.
# The request body is strict JSON (no trailing commas).

curl -sS -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  "https://${API_ENDPOINT}:generateContent" \
  -d '{
    "contents": {
      "role": "USER",
      "parts": [
        { "text": "Generate a side profile image of a person sitting at this table drinking out of this teacup in a 1:1 aspect ratio. Include a caption that could be used to post this image on social media."},
        {"file_data": {
          "mime_type": "image/png",
          "file_uri": "gs://cloud-samples-data/generative-ai/image/table.png"
          }
        },
        {"file_data": {
          "mime_type": "image/png",
          "file_uri": "gs://cloud-samples-data/generative-ai/image/teacup-1.png"
          }
        }
      ]
    },
    "generation_config": {
      "response_modalities": ["TEXT", "IMAGE"]
    },
    "safetySettings": {
      "method": "PROBABILITY",
      "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
      "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    }
  }' >response.json

Extract the text and image data from the response and visualize.


In [None]:
# JSON text is UTF-8; do not depend on the platform default encoding.
with open("response.json", encoding="utf-8") as f:
    response_data = json.load(f)

# A failed request is saved as an {"error": ...} body without "candidates";
# report it directly instead of failing below with an opaque KeyError.
if "error" in response_data:
    raise RuntimeError(f"generateContent request failed: {response_data['error']}")

# Render the parts in the order returned so the caption text and the image
# appear as the model sequenced them.
for part in response_data["candidates"][0]["content"]["parts"]:
    if "text" in part:
        display(Markdown(part["text"]))
    if "inlineData" in part:
        # Image bytes arrive base64-encoded under inlineData.data.
        content = part["inlineData"]["data"]
        image_data = base64.b64decode(content)
        display(Image(data=image_data, width=350, height=350))