In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Imagen 3 Customized Avatar Images

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/vision/use-cases/imagen3_customization_avatar.ipynb">
      <img src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fvision%2Fuse-cases%2Fimagen3_customization_avatar.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/vision/use-cases/imagen3_customization_avatar.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/use-cases/imagen3_customization_avatar.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/use-cases/imagen3_customization_avatar.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/use-cases/imagen3_customization_avatar.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/use-cases/imagen3_customization_avatar.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/use-cases/imagen3_customization_avatar.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/use-cases/imagen3_customization_avatar.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author |
| --- |
| [Katie Nguyen](https://github.com/katiemn) |

## Overview

### Imagen 3

Imagen 3 on Vertex AI brings Google's generative AI image customization capabilities to application developers. It's capable of modifying images to fit a certain style. Thus, developers have more control when building next-generation AI products that transform their imagination into high quality visual assets. Learn more about [Imagen on Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/image/overview).

In this tutorial, you will learn how to use the Google Gen AI SDK for Python to generate customized avatar images using few-shot learning with Imagen 3. You'll complete image preprocessing steps, supply a text prompt and guide new image generation in the following styles:

- Watercolor
- Pencil sketch
- Illustration
- 3D cartoon

## Get started


### Install Google Gen AI SDK for Python & OpenCV


In [None]:
# Install or upgrade the Gen AI SDK (google-genai) and OpenCV (opencv-python) quietly.
%pip install --upgrade --quiet google-genai opencv-python

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment.


In [2]:
import sys

# The presence of `google.colab` in sys.modules is used as the signal that the
# notebook is running on Colab; everywhere else this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate the current Colab user so later Google Cloud calls are authorized.
    auth.authenticate_user()

### Import libraries

In [3]:
import base64
import io

from IPython.display import Markdown, display
from PIL import Image as PIL_Image
import cv2
from google import genai
import google.auth
import google.auth.transport.requests
from google.genai.types import (
    ControlReferenceConfig,
    ControlReferenceImage,
    EditImageConfig,
    Image,
    Part,
    SubjectReferenceConfig,
    SubjectReferenceImage,
)
import matplotlib.pyplot as plt
import requests

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [4]:
import os

# Project used for all Vertex AI calls. Leave the placeholder untouched to fall
# back to the GOOGLE_CLOUD_PROJECT environment variable.
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # NOTE(review): when GOOGLE_CLOUD_PROJECT is unset, str(None) produces the
    # literal string "None" instead of failing here.
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

# Region for the Gen AI client; falls back to us-central1 when
# GOOGLE_CLOUD_REGION is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Gen AI SDK client configured to go through Vertex AI.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Define helper functions

In [5]:
def send_request_to_google_api(api_endpoint, data=None, timeout=120):
    """
    Sends an authenticated HTTP POST request to a Google API endpoint.

    Credentials are obtained through Application Default Credentials and
    refreshed on every call to get a bearer token.

    Args:
        api_endpoint: The URL of the Google API endpoint.
        data: (Optional) Dictionary sent as the JSON request body.
        timeout: (Optional) Seconds to wait for the server before giving up.
            Pass None to wait indefinitely.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond within `timeout`.
    """

    # google.auth.default() also returns a project ID, which is not needed here.
    creds, _ = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    access_token = creds.token

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # Without an explicit timeout, requests can block forever on a stalled
    # connection, which would freeze the notebook cell.
    response = requests.post(
        api_endpoint, headers=headers, json=data, timeout=timeout
    )
    response.raise_for_status()
    return response.json()


def get_resized_bytes(image_bytes, max_size):
    """Resize an image to fit a square bounding box and return it as PNG bytes.

    Args:
        image_bytes: Encoded image data that PIL can open.
        max_size: Bound, in pixels, passed to `thumbnail` for both dimensions.

    Returns:
        The PNG-encoded bytes of the resized image.

    Side effects:
        Also writes the resized image to "resized.png" in the working
        directory; later cells of this notebook read that file.
    """
    picture = PIL_Image.open(io.BytesIO(image_bytes))
    # thumbnail() modifies `picture` in place, so both saves below see the
    # resized image.
    picture.thumbnail((max_size, max_size))

    png_buffer = io.BytesIO()
    picture.save(png_buffer, "PNG")
    picture.save("resized.png", "PNG")
    return png_buffer.getvalue()


def segment_image(request: dict):
    """Send a single instance to the image segmentation endpoint.

    Args:
        request: The instance payload (prompt and image) for the model.

    Returns:
        The "predictions" entry of the JSON response.
    """
    payload = {
        "instances": [request],
        "parameters": {"mode": "semantic", "confidenceThreshold": 0.5},
    }
    # `segmentation_model` is the module-level endpoint URL defined in the
    # "Load the models" cell.
    return send_request_to_google_api(segmentation_model, payload)["predictions"]


def crop_and_pad_face(image_path, output_path, bbox, padding_factor=0.5):
    """Crop a padded region around a face and write it to disk.

    Args:
        image_path: Path of the image to read.
        output_path: Path the cropped image is written to.
        bbox: Face box given as (y, x, h, w), i.e. top-left row, top-left
            column, height, width.
        padding_factor: Fraction of the box width/height added on every side
            before cropping. The padded box is clamped to the image bounds.

    Raises:
        FileNotFoundError: If the image at `image_path` cannot be read.
        ValueError: If the padded box does not overlap the image.
        OSError: If the cropped image cannot be written to `output_path`.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    H, W = image.shape[:2]
    y, x, h, w = bbox

    pad_x = int(w * padding_factor)
    pad_y = int(h * padding_factor)

    # Clamp the padded box so it never reaches outside the image.
    crop_x1 = max(0, x - pad_x)
    crop_y1 = max(0, y - pad_y)
    crop_x2 = min(W, x + w + pad_x)
    crop_y2 = min(H, y + h + pad_y)

    if crop_x1 >= crop_x2 or crop_y1 >= crop_y2:
        raise ValueError(
            f"Bounding box {list(bbox)} does not overlap the {W}x{H} image."
        )

    cropped_image = image[crop_y1:crop_y2, crop_x1:crop_x2]
    # cv2.imwrite returns False when the file could not be written.
    if not cv2.imwrite(output_path, cropped_image):
        raise OSError(f"Could not write cropped image to: {output_path}")


def display_images(generated_image, ref_image) -> None:
    """Show two images side by side in a single matplotlib figure.

    Args:
        generated_image: Image drawn in the left panel, titled "Modified Image".
        ref_image: Image drawn in the right panel, titled "Reference Image".
    """
    _, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        (left_ax, generated_image, "Modified Image"),
        (right_ax, ref_image, "Reference Image"),
    )
    for panel_ax, picture, title in panels:
        panel_ax.imshow(picture)
        panel_ax.set_title(title)
        # Hide ticks and frame; only the picture and its title are shown.
        panel_ax.axis("off")
    plt.show()

### Load the models

Imagen 3 Customization: `imagen-3.0-capability-001`

Image Segmentation: `image-segmentation-001`

In [6]:
# Model ID passed to `client.models.edit_image` for the Imagen 3 customization calls.
customization_model = "imagen-3.0-capability-001"
# Full REST `:predict` URL of the segmentation model, called directly over HTTP.
# NOTE(review): the region is hard-coded to us-central1 and does not follow
# LOCATION — confirm whether that is intentional.
segmentation_model = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/us-central1/publishers/google/models/image-segmentation-001:predict"

## Image preprocessing

To get the best quality out of your customized images, preprocess the input image first: segmentation, face detection, and cropping all help improve the overall results.

### Download an image

For this process, you'll need an image of one person whose face is not occluded. You can either download an image from a Google Cloud Storage bucket or upload one from your computer to session storage.

**Google Cloud Storage:** Switch out the URL in the `wget` command below to point to the URL of your image in a Google Cloud Storage bucket.

**Session storage:** Click on the file icon on the left hand side. Then, click on the 'Upload to session storage' button and select an image from your computer.

In [None]:
# Download the sample image into the current working directory.
!wget https://storage.googleapis.com/cloud-samples-data/generative-ai/image/man-in-field.png

Enter the file name of the image you downloaded from Google Cloud Storage or uploaded from your computer in the `IMAGE_FILE` variable below. Running the next cell renames the image file to "face.png".

In [8]:
IMAGE_FILE = "man-in-field.png"  # @param {type: "string"}

# Give the chosen image the fixed name that the rest of the notebook reads.
# The cell is safe to re-run: once the file has been renamed, a second run
# keeps the existing "face.png" instead of failing with FileNotFoundError.
if os.path.exists(IMAGE_FILE):
    # os.replace overwrites an existing "face.png" on every platform.
    os.replace(IMAGE_FILE, "face.png")
elif os.path.exists("face.png"):
    print(f"'{IMAGE_FILE}' not found; keeping the existing 'face.png'.")
else:
    raise FileNotFoundError(
        f"'{IMAGE_FILE}' was not found. Download or upload the image first, "
        "then re-run this cell."
    )

### Image segmentation & background replacement


> ⚠️ **Warning:** This process only works if the initial image has a single face that is not heavily occluded.

In this next cell, you'll send a request to the image segmentation model to create a mask around the person in the image so that they can be placed on a solid white background.

In [None]:
prompt = "person"
with open("face.png", "rb") as image_file:
    image_bytes = image_file.read()

# Resize before sending; get_resized_bytes also writes "resized.png", which
# later cells use as the base image.
new_bytes = get_resized_bytes(image_bytes, 640)
# b64encode returns bytes, which cannot be serialized into the JSON request
# body; decode to str so the payload is valid JSON.
base64_encoded_bytes = base64.b64encode(new_bytes).decode("utf-8")

request = {"prompt": prompt, "image": {"bytesBase64Encoded": base64_encoded_bytes}}
mask = segment_image(request)
# Fail loudly instead of silently reusing a stale (or missing) mask.png.
if not mask:
    raise ValueError(
        "The segmentation model returned no mask. Try a different image."
    )

# Each prediction carries a base64-encoded mask image; every iteration writes
# to the same file, so the last prediction is the one kept in "mask.png".
for m in mask:
    bytes_b64 = m["bytesBase64Encoded"]
    decoded_image_data = base64.b64decode(bytes_b64)
    image_stream = io.BytesIO(decoded_image_data)
    PIL_Image.open(image_stream).save("mask.png")

display_images(PIL_Image.open("mask.png"), PIL_Image.open("face.png"))

Now that you've segmented the person in the image, remove the background and replace it with a solid white backdrop.

In [None]:
# "resized.png" is written by get_resized_bytes and "mask.png" by the
# segmentation cell, so both must have been run before this cell.
base_image = PIL_Image.open("resized.png")
pil_mask = PIL_Image.open("mask.png")
# RGBA fill value for the new canvas (all channels at 255).
color = (255, 255, 255, 255)

# Create a solid canvas the size of the resized photo, then paste the photo
# onto it using the segmentation mask as the paste mask.
white_background = PIL_Image.new("RGBA", base_image.size, color)
white_background.paste(base_image, mask=pil_mask)

# Persist the result for the face detection and cropping steps.
white_background.save("remove-background.png")
display(white_background)

### Facial detection

Run the following code to detect the face in the image using the pre-trained Haar Cascade classifier that is built into OpenCV. This cell will also draw a green bounding box around the detected face.

> ⚠️ **Warning:** If a face isn't detected, download a new photo and restart the image preprocessing steps. Otherwise, the rest of the tutorial will not work as expected.

In [None]:
img = cv2.imread("remove-background.png")
# cv2.imread returns None (it does not raise) when the file cannot be read.
if img is None:
    raise FileNotFoundError(
        "Could not read 'remove-background.png'. Run the background replacement cell first."
    )
gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# load the pre-trained Haar Cascade classifier that is built into OpenCV
face_classifier = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)
face = face_classifier.detectMultiScale(
    gray_image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
)

# Stop here when nothing was found; otherwise `box_2d_list` would be undefined
# (or left over from an earlier run) when the cropping cell executes.
if len(face) == 0:
    raise ValueError(
        "No face was detected. Download a new photo and restart the image preprocessing steps."
    )

# If several candidates are returned, keep the largest box (by area) as the
# face to crop. The crop helper expects the order (y, x, h, w).
x, y, w, h = max(face, key=lambda box: box[2] * box[3])
detected_face = [y, x, h, w]
box_2d_list = [int(value) for value in detected_face]

# Draw a green rectangle around every detection for visual inspection.
for x, y, w, h in face:
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 4)
# OpenCV images are BGR; matplotlib expects RGB.
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(20, 10))
plt.imshow(img_rgb)
plt.axis("off")
plt.show()

### Crop the image

Once a face is detected, crop around the face using a helper function defined in the beginning of this tutorial.

In [None]:
# Crop around the detected face (box_2d_list is in (y, x, h, w) order) using
# the helper's default padding, then show the result.
crop_and_pad_face("remove-background.png", "cropped_image.png", box_2d_list)
display(PIL_Image.open("cropped_image.png"))

## Create avatar images

Now that you've preprocessed the initial image, it's time to customize the detected face in certain styles. Before you do this, you'll need to use Gemini to write a description of the person that can be used in later calls to Imagen.

### Use Gemini to generate a description

In [None]:
# Read the cropped face as raw bytes so it can be sent inline to Gemini.
with open("cropped_image.png", "rb") as f:
    image = f.read()

# Ask Gemini for a very short description of the face. Later cells read it
# back through `response.text` when building the Imagen prompts.
# NOTE(review): this is a dated preview model ID — confirm it is still served,
# or switch to a current stable Gemini model.
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-05-20",
    contents=[
        Part.from_bytes(data=image, mime_type="image/png"),
        "Briefly provide a description of the person's face in this image with 6 words max, no punctuation, all lowercase",
    ],
)

display(Markdown(response.text))

### Style 1: Watercolor

In [None]:
# The cropped face is used twice: as the subject reference [1] and as the
# face-mesh control reference [2] that the prompt refers to.
subject_image = Image.from_file(location="cropped_image.png")

subject_reference_image = SubjectReferenceImage(
    reference_id=1,
    reference_image=subject_image,
    config=SubjectReferenceConfig(
        subject_description=response.text, subject_type="SUBJECT_TYPE_PERSON"
    ),
)

control_reference_image = ControlReferenceImage(
    reference_id=2,
    reference_image=subject_image,
    config=ControlReferenceConfig(control_type="CONTROL_TYPE_FACE_MESH"),
)

prompt = f"Create a watercolor image of a {response.text} [1] in the pose of the control image [2] to match the description: A watercolor portrait of {response.text} [1] in a watercolor style, light and low-contrast color strokes, bright pastel color, a warm atmosphere, clean background with watercolor brushstrokes, bold dry brush, dry on dry technique, grainy textured paper, contrasting light and shadow, fine watery brush strokes, patchy details"

image = client.models.edit_image(
    model=customization_model,
    prompt=prompt,
    reference_images=[subject_reference_image, control_reference_image],
    config=EditImageConfig(
        edit_mode="EDIT_MODE_DEFAULT",
        number_of_images=1,
        safety_filter_level="BLOCK_MEDIUM_AND_ABOVE",
        person_generation="ALLOW_ADULT",
    ),
)

# The response can come back without images (presumably when every candidate
# was filtered); report that clearly instead of failing on the [0] lookup.
if not image.generated_images:
    raise RuntimeError(
        "Imagen returned no images for the watercolor style. Adjust the prompt or input image and try again."
    )

# Decode from the public `image_bytes` field rather than the SDK-private
# `_pil_image` attribute.
generated_pil_image = PIL_Image.open(
    io.BytesIO(image.generated_images[0].image.image_bytes)
)
display_images(generated_pil_image, PIL_Image.open("resized.png"))

### Style 2: Pencil sketch

In [None]:
# The cropped face is used twice: as the subject reference [1] and as the
# face-mesh control reference [2] that the prompt refers to.
subject_image = Image.from_file(location="cropped_image.png")

subject_reference_image = SubjectReferenceImage(
    reference_id=1,
    reference_image=subject_image,
    config=SubjectReferenceConfig(
        subject_description=response.text, subject_type="SUBJECT_TYPE_PERSON"
    ),
)

control_reference_image = ControlReferenceImage(
    reference_id=2,
    reference_image=subject_image,
    config=ControlReferenceConfig(control_type="CONTROL_TYPE_FACE_MESH"),
)

prompt = f"Create a pencil sketch of {response.text} [1] in the pose of the control image [2] to match the description: A simple pencil style sketch of a portrait of {response.text} with 6B and graphite pencils, white background, high quality, visible pencil lines, looking at the camera, natural human eyes"

image = client.models.edit_image(
    model=customization_model,
    prompt=prompt,
    reference_images=[subject_reference_image, control_reference_image],
    config=EditImageConfig(
        edit_mode="EDIT_MODE_DEFAULT",
        number_of_images=1,
        safety_filter_level="BLOCK_MEDIUM_AND_ABOVE",
        person_generation="ALLOW_ADULT",
    ),
)

# The response can come back without images (presumably when every candidate
# was filtered); report that clearly instead of failing on the [0] lookup.
if not image.generated_images:
    raise RuntimeError(
        "Imagen returned no images for the pencil sketch style. Adjust the prompt or input image and try again."
    )

# Decode from the public `image_bytes` field rather than the SDK-private
# `_pil_image` attribute.
generated_pil_image = PIL_Image.open(
    io.BytesIO(image.generated_images[0].image.image_bytes)
)
display_images(generated_pil_image, PIL_Image.open("resized.png"))

### Style 3: Illustration

In [None]:
# The cropped face is used twice: as the subject reference [1] and as the
# face-mesh control reference [2] that the prompt refers to.
subject_image = Image.from_file(location="cropped_image.png")

subject_reference_image = SubjectReferenceImage(
    reference_id=1,
    reference_image=subject_image,
    config=SubjectReferenceConfig(
        subject_description=response.text, subject_type="SUBJECT_TYPE_PERSON"
    ),
)

control_reference_image = ControlReferenceImage(
    reference_id=2,
    reference_image=subject_image,
    config=ControlReferenceConfig(control_type="CONTROL_TYPE_FACE_MESH"),
)

prompt = f"Create an illustration of {response.text} [1] in the pose of the control image [2] to match the description: a portrait of {response.text} [1] in a vector illustration style with bold outlines, pastel tones, clean and precise lines, organic wide brushstrokes in the background, soft aesthetics theme, blended color on the skin, dramatic lighting and contrasting highlights and shadows"

image = client.models.edit_image(
    model=customization_model,
    prompt=prompt,
    reference_images=[subject_reference_image, control_reference_image],
    config=EditImageConfig(
        edit_mode="EDIT_MODE_DEFAULT",
        number_of_images=1,
        safety_filter_level="BLOCK_MEDIUM_AND_ABOVE",
        person_generation="ALLOW_ADULT",
    ),
)

# The response can come back without images (presumably when every candidate
# was filtered); report that clearly instead of failing on the [0] lookup.
if not image.generated_images:
    raise RuntimeError(
        "Imagen returned no images for the illustration style. Adjust the prompt or input image and try again."
    )

# Decode from the public `image_bytes` field rather than the SDK-private
# `_pil_image` attribute.
generated_pil_image = PIL_Image.open(
    io.BytesIO(image.generated_images[0].image.image_bytes)
)
display_images(generated_pil_image, PIL_Image.open("resized.png"))

### Style 4: 3D cartoon

In [None]:
# The cropped face is used twice: as the subject reference [1] and as the
# face-mesh control reference [2] that the prompt refers to.
subject_image = Image.from_file(location="cropped_image.png")

subject_reference_image = SubjectReferenceImage(
    reference_id=1,
    reference_image=subject_image,
    config=SubjectReferenceConfig(
        subject_description=response.text, subject_type="SUBJECT_TYPE_PERSON"
    ),
)

control_reference_image = ControlReferenceImage(
    reference_id=2,
    reference_image=subject_image,
    config=ControlReferenceConfig(control_type="CONTROL_TYPE_FACE_MESH"),
)

prompt = f"Create a 3D cartoon style image of {response.text} [1] in the pose of the control image [2] to match the description: a portrait of  {response.text} [1] in 3D cartoon style with a blurred background. A cute and lovely character, smiley face looking at the camera, pastel color tones, high quality, super details, skin texture, texture mapping, soft shadows, soft realistic lighting, vibrant colors"

image = client.models.edit_image(
    model=customization_model,
    prompt=prompt,
    reference_images=[subject_reference_image, control_reference_image],
    config=EditImageConfig(
        edit_mode="EDIT_MODE_DEFAULT",
        number_of_images=1,
        safety_filter_level="BLOCK_MEDIUM_AND_ABOVE",
        person_generation="ALLOW_ADULT",
    ),
)

# The response can come back without images (presumably when every candidate
# was filtered); report that clearly instead of failing on the [0] lookup.
if not image.generated_images:
    raise RuntimeError(
        "Imagen returned no images for the 3D cartoon style. Adjust the prompt or input image and try again."
    )

# Decode from the public `image_bytes` field rather than the SDK-private
# `_pil_image` attribute.
generated_pil_image = PIL_Image.open(
    io.BytesIO(image.generated_images[0].image.image_bytes)
)
display_images(generated_pil_image, PIL_Image.open("resized.png"))