In [None]:
# Copyright 2026 MongoDB, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Voyage 4 Embedding Models

This notebook demonstrates how to deploy and use the Voyage 4 family of embedding models, featuring an **industry-first shared embedding space** that allows you to mix and match models for optimal cost and performance.

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/voyage-4.ipynb">
      <img src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fgenerative_ai%2Fvoyage-4.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/generative_ai/voyage-4.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/voyage-4.ipynb">
      <img src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" alt="GitHub logo" width="32px"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

The **Voyage 4** family introduces an **industry-first shared embedding space** across all model sizes. This means embeddings from any Voyage 4 model are **interchangeable**—you can encode documents with one model and queries with another, enabling optimal cost-performance trade-offs.

### Key Features

* **Shared Embedding Space**: All Voyage 4 models (large, standard, lite) produce compatible embeddings, so you can mix models for documents vs. queries
* **Matryoshka Representation Learning (MRL)**: Variable-dimension embeddings (256, 512, 1024, 2048) from the same model
* **Quantization-Aware Training (QAT)**: Optimized for int8, uint8, binary, and ubinary formats with minimal quality loss
* **Up to 32K input tokens**: Supports long documents

### Model Family

| Model | Description | Best For |
| :--- | :--- | :--- |
| **voyage-4-large** | State-of-the-art general-purpose and multilingual embedding optimized for retrieval quality | Document embeddings where quality matters most |
| **voyage-4** | General-purpose multilingual embedding model optimized for retrieval/search and AI applications | Balanced cost/quality trade-off |
| **voyage-4-lite** | Lightweight general-purpose embedding model optimized for low latency and cost | Query embeddings and cost-sensitive applications |

### What you'll learn

In this notebook, you will:

* Deploy a Voyage 4 model to a Vertex AI endpoint
* Generate embeddings and perform semantic similarity
* Explore advanced parameters (dimensions, quantization)

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI Model Garden
* Vertex AI Prediction endpoints

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform numpy

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Colab only: shut the kernel down with restart=True so that the packages
# installed above are picked up by a fresh kernel.
if "google.colab" in sys.modules:
    from IPython import Application

    app = Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Colab only: authenticate this notebook environment. Outside Colab the cell
# does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# @title Setup Google Cloud project

# Set your Google Cloud project ID and region below:

import os

import vertexai

# @markdown Enter your project ID if not auto-detected:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# The untouched placeholder or an empty value means "not provided": fall back to
# the GOOGLE_CLOUD_PROJECT environment variable (None when it is not set).
if PROJECT_ID == "[your-project-id]" or not PROJECT_ID:
    PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")

# @markdown Select your region:
LOCATION = "us-central1"  # @param ["us-central1", "us-east1", "us-west1", "europe-west1", "europe-west4", "asia-east1", "asia-southeast1"]

# Echo the resolved settings before initializing the SDK with them.
print(f"Project ID: {PROJECT_ID}")
print(f"Location: {LOCATION}")

vertexai.init(location=LOCATION, project=PROJECT_ID)

## Deploy model

The Voyage 4 family features a **shared embedding space**, meaning embeddings from any Voyage 4 model (large, standard, lite) are interchangeable. This allows you to use different models for documents and queries while maintaining compatibility.

In this notebook, you deploy one of the following three models to a single endpoint:

* **voyage-4-large** — State-of-the-art retrieval quality, ideal for document embeddings
* **voyage-4** — Balanced for retrieval/search and AI applications
* **voyage-4-lite** — Optimized for low latency and cost, ideal for query embeddings

### Initialize the Model

Initialize the Voyage 4 model from Model Garden.

Use the `list_deploy_options()` method to view the verified deployment configurations for your selected model.

In [None]:
from vertexai import model_garden

# @title Select Model
# @markdown Choose the Voyage 4 model to deploy:
MODEL = "voyage-4"  # @param ["voyage-4-large", "voyage-4", "voyage-4-lite"]

# An empty selection falls back to the standard voyage-4 model.
MODEL = MODEL or "voyage-4"

MODEL_NAME = f"mongodb/{MODEL}@latest"
model = model_garden.OpenModel(MODEL_NAME)

# voyage-4-large is paired with the 80GB A100 machine type; the other models
# use the standard A100 machine type.
MACHINE_TYPE, ACCELERATOR_TYPE = (
    ("a2-ultragpu-1g", "NVIDIA_A100_80GB")
    if MODEL == "voyage-4-large"
    else ("a2-highgpu-1g", "NVIDIA_TESLA_A100")
)

print(f"Selected model: {MODEL_NAME}")
print(f"Accelerator: {ACCELERATOR_TYPE} on {MACHINE_TYPE}")

# List the deployment configurations reported for the selected model.
deploy_options = model.list_deploy_options(concise=True)
print(deploy_options)

In [None]:
# @title Deploy or connect to endpoint
# @markdown Choose whether to deploy a new model or use an existing endpoint:

deployment_option = "deploy_new"  # @param ["deploy_new", "use_existing"]

# @markdown ---
# @markdown If using existing endpoint, provide the endpoint ID:
ENDPOINT_ID = ""  # @param {type:"string"}

if deployment_option == "deploy_new":
    # Deploy the selected model to a new dedicated endpoint using the machine
    # and accelerator chosen in the model selection cell.
    print(f"Deploying {MODEL}...")
    print(f"Using {ACCELERATOR_TYPE} on {MACHINE_TYPE}")
    endpoint = model.deploy(
        machine_type=MACHINE_TYPE,
        accelerator_type=ACCELERATOR_TYPE,
        accelerator_count=1,
        accept_eula=True,
        use_dedicated_endpoint=True,
    )
    print(f"Endpoint deployed: {endpoint.display_name}")
    print(f"Endpoint resource name: {endpoint.resource_name}")
else:
    # Tolerate stray whitespace pasted into the form field.
    ENDPOINT_ID = ENDPOINT_ID.strip()
    if not ENDPOINT_ID:
        raise ValueError("Please provide an ENDPOINT_ID when using existing endpoint")

    from google.cloud import aiplatform

    print(f"Connecting to existing endpoint: {ENDPOINT_ID}")
    # Hand the ID (or a full resource name) to the SDK instead of formatting the
    # resource path by hand: PROJECT_ID is None when neither the form field nor
    # GOOGLE_CLOUD_PROJECT provided a value, which used to yield
    # "projects/None/locations/...". With project=None the SDK is expected to
    # fall back to the project configured by vertexai.init().
    endpoint = aiplatform.Endpoint(
        endpoint_name=ENDPOINT_ID,
        project=PROJECT_ID,
        location=LOCATION,
    )
    print(f"Using endpoint: {endpoint.display_name}")
    print(f"Endpoint resource name: {endpoint.resource_name}")

## Generate embeddings

Now let's look at basic embedding generation with the Voyage 4 models.

In [None]:
import json

# Sample sentences that are embedded together in one batched request
texts = [
    "Machine learning enables computers to learn from data.",
    "Natural language processing helps computers understand human language.",
    "Computer vision allows machines to interpret visual information.",
    "Deep learning uses neural networks with multiple layers.",
]

# Serialize the request to UTF-8 JSON and send it to the /embeddings route
body = {"input": texts, "output_dimension": 1024, "input_type": "document"}
payload = json.dumps(body).encode("utf-8")
response = endpoint.invoke(
    request_path="/embeddings",
    body=payload,
    headers={"Content-Type": "application/json"},
)

# Collect the "embedding" field of every entry in the response's "data" list
result = response.json()
embeddings = []
for item in result["data"]:
    embeddings.append(item["embedding"])

first_embedding, second_embedding = embeddings[0], embeddings[1]

print(f"Number of texts embedded: {len(embeddings)}")
print(f"Embedding dimension: {len(first_embedding)}")
print(f"\nFirst embedding (first 5 values): {first_embedding[:5]}")
print(f"Second embedding (first 5 values): {second_embedding[:5]}")

### Semantic similarity

Use embeddings to compute the semantic similarity between texts:

In [None]:
import json

import numpy as np


def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector, as a sequence of numbers or a NumPy array.
        vec2: Second vector, with the same number of elements as ``vec1``.

    Returns:
        The cosine of the angle between the two vectors as a float, or 0.0
        when either vector has zero magnitude (the similarity is undefined
        in that case).
    """
    # Convert to float arrays so integer (e.g. quantized) inputs are handled
    # with floating-point arithmetic.
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector would otherwise trigger a 0/0 division that produces NaN
    # and a RuntimeWarning; report "no similarity" instead.
    if norm_product == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)


# Example texts
query = "How do computers learn from examples?"
documents = [
    "Machine learning enables computers to learn from data.",
    "The weather today is sunny and warm.",
    "Neural networks can recognize patterns in data.",
    "I enjoy cooking Italian food.",
]

# Embed the query and the documents in separate requests so that each side is
# sent with its matching input_type ("query" vs. "document"), which is the
# recommended setup for retrieval-style comparisons.
body = {"input": [query], "output_dimension": 1024, "input_type": "query"}
response = endpoint.invoke(
    request_path="/embeddings",
    body=json.dumps(body).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
result = response.json()
query_embedding = result["data"][0]["embedding"]

body = {"input": documents, "output_dimension": 1024, "input_type": "document"}
response = endpoint.invoke(
    request_path="/embeddings",
    body=json.dumps(body).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
result = response.json()
doc_embeddings = [item["embedding"] for item in result["data"]]

# Calculate similarities: pair every document with its own embedding
print(f"Query: {query}\n")
print("Similarity scores:")
for doc, doc_embedding in zip(documents, doc_embeddings):
    similarity = cosine_similarity(query_embedding, doc_embedding)
    print(f"{similarity:.4f} - {doc}")

## Advanced parameters

Let's explore the advanced parameters that Voyage 4 models support to optimize your embeddings.

### Understanding input_type: Query vs Document

The `input_type` parameter optimizes embeddings for retrieval tasks:

* **`query`**: Use this when the text represents a search query or question. The model prepends "Represent the query for retrieving supporting documents: " to optimize for retrieval.
* **`document`**: Use this when the text represents a document or passage to be searched. The model prepends "Represent the document for retrieval: " to optimize for indexing.
* **`null`** (default): No special prompt is added. Use for general-purpose embeddings.

**Best Practice**: For retrieval/search applications, use `input_type="query"` for your search queries and `input_type="document"` for the documents you're indexing.

In [None]:
import json

# Retrieval example: one search query and the passages it is matched against
query_text = "What is machine learning?"
document_texts = [
    "Machine learning enables computers to learn from data.",
    "Natural language processing helps computers understand human language.",
    "Computer vision allows machines to interpret visual information.",
]

# Both requests are sent as JSON
json_headers = {"Content-Type": "application/json"}

# Query side: input_type="query" marks the text as a search query
query_body = dict(
    input=[query_text],
    output_dimension=1024,
    input_type="query",  # Optimize for search queries
)
query_response = endpoint.invoke(
    request_path="/embeddings",
    body=json.dumps(query_body).encode("utf-8"),
    headers=json_headers,
)
query_result = query_response.json()
query_embedding = query_result["data"][0]["embedding"]

# Document side: input_type="document" marks the texts as passages to index
doc_body = dict(
    input=document_texts,
    output_dimension=1024,
    input_type="document",  # Optimize for document indexing
)
doc_response = endpoint.invoke(
    request_path="/embeddings",
    body=json.dumps(doc_body).encode("utf-8"),
    headers=json_headers,
)
doc_result = doc_response.json()
doc_embeddings = []
for item in doc_result["data"]:
    doc_embeddings.append(item["embedding"])

# Report the size of each side and a short preview of the vectors
print(f"Query: {query_text}")
print(f"Query embedding dimension: {len(query_embedding)}")
print(f"\nNumber of documents embedded: {len(doc_embeddings)}")
print(f"Document embedding dimension: {len(doc_embeddings[0])}")
print(f"\nQuery embedding (first 5 values): {query_embedding[:5]}")
print(f"First document embedding (first 5 values): {doc_embeddings[0][:5]}")

### Using different output dimensions (Matryoshka Representation Learning)

Voyage 4 models support **Matryoshka Representation Learning (MRL)**, providing variable-dimension embeddings: 256, 512, 1024 (default), and 2048. Smaller dimensions reduce storage and computation costs, while larger dimensions may provide better accuracy.

In [None]:
import json

text = "Machine learning enables computers to learn from data."

# Output dimensions to request, one call per value
dimensions = [256, 512, 1024, 2048]

print("Comparing different output dimensions (MRL):\n")
for dim in dimensions:
    # Same text every time; only the requested output_dimension changes
    body = {"input": [text], "output_dimension": dim, "input_type": "document"}
    payload = json.dumps(body).encode("utf-8")
    response = endpoint.invoke(
        request_path="/embeddings",
        body=payload,
        headers={"Content-Type": "application/json"},
    )
    result = response.json()
    embedding = result["data"][0]["embedding"]
    value_count = len(embedding)

    print(f"Dimension {dim}:")
    print(f"  Length: {value_count}")
    print(f"  First 5 values: {embedding[:5]}")
    # Storage estimate counts 4 bytes (float32) per returned value
    print(f"  Storage size: ~{value_count * 4} bytes (float32)\n")

### Using different output data types (Quantization-Aware Training)

Voyage 4 models support **Quantization-Aware Training (QAT)**, optimizing embeddings for multiple output data types:

* **`float`** (default): 32-bit floating-point numbers, highest precision
* **`int8`**: 8-bit signed integers (-128 to 127), 4x smaller than float
* **`uint8`**: 8-bit unsigned integers (0 to 255), 4x smaller than float
* **`binary`**: Bit-packed signed integers (int8), 32x smaller than float
* **`ubinary`**: Bit-packed unsigned integers (uint8), 32x smaller than float

Quantized formats trade some precision for significant storage savings, with minimal quality loss thanks to QAT.

In [None]:
import json

text = "Machine learning enables computers to learn from data."

# Test different output data types
output_dtypes = ["float", "int8", "uint8", "binary", "ubinary"]
output_dimension = 1024

# Bytes needed to store one element of the returned list for each data type.
# float values are counted as 4-byte float32; every quantized format takes one
# byte per element (binary/ubinary are bit-packed, so their lists hold
# output_dimension / 8 elements).
bytes_per_element = {"float": 4, "int8": 1, "uint8": 1, "binary": 1, "ubinary": 1}

print("Comparing different output data types (QAT):\n")
for dtype in output_dtypes:
    body = {
        "input": [text],
        "output_dimension": output_dimension,
        "output_dtype": dtype,
        "input_type": "document",
    }
    response = endpoint.invoke(
        request_path="/embeddings",
        body=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    result = response.json()
    embedding = result["data"][0]["embedding"]

    # Calculate actual storage size. The explicit lookup makes an unlisted dtype
    # fail with a KeyError instead of silently reusing the storage_bytes value
    # left over from the previous loop iteration.
    storage_bytes = len(embedding) * bytes_per_element[dtype]

    print(f"Output dtype: {dtype}")
    print(f"  Length: {len(embedding)}")
    print(f"  Value type: {type(embedding[0]).__name__}")
    print(f"  First 5 values: {embedding[:5]}")
    print(f"  Storage size: ~{storage_bytes} bytes")

    # Calculate compression ratio vs. a float32 vector of the full dimension
    if dtype != "float":
        compression_ratio = (output_dimension * 4) / storage_bytes
        print(f"  Compression: {compression_ratio:.1f}x smaller than float")
    print()

### Combining output_dimension and output_dtype

You can combine different dimensions and data types to optimize for your use case.

For details, see the Voyage AI guide on [offset binary](https://docs.voyageai.com/docs/flexible-dimensions-and-quantization#offset-binary) and [binary embeddings](https://docs.voyageai.com/docs/flexible-dimensions-and-quantization#quantization).

In [None]:
import json

text = "Machine learning enables computers to learn from data."

# Both requests are sent as JSON
json_headers = {"Content-Type": "application/json"}

# Ultra-compact variant: smallest dimension combined with bit-packed ubinary
compact_body = dict(
    input=[text],
    output_dimension=256,
    output_dtype="ubinary",  # Most compact format
    input_type="document",
)
compact_response = endpoint.invoke(
    request_path="/embeddings",
    body=json.dumps(compact_body).encode("utf-8"),
    headers=json_headers,
)
compact_result = compact_response.json()
compact_embedding = compact_result["data"][0]["embedding"]

# High-precision variant: largest dimension combined with float output
precise_body = dict(
    input=[text],
    output_dimension=2048,
    output_dtype="float",  # Highest precision
    input_type="document",
)
precise_response = endpoint.invoke(
    request_path="/embeddings",
    body=json.dumps(precise_body).encode("utf-8"),
    headers=json_headers,
)
precise_result = precise_response.json()
precise_embedding = precise_result["data"][0]["embedding"]

# Storage per vector: one byte per bit-packed element vs. four bytes per float32
compact_storage = len(compact_embedding)
precise_storage = 4 * len(precise_embedding)

print("Storage comparison:\n")
print("Ultra-compact (256-dim ubinary):")
print("  Dimension: 256")
print(f"  Storage: ~{compact_storage} bytes")
print(f"  First 5 values: {compact_embedding[:5]}\n")

print("High-precision (2048-dim float):")
print(f"  Dimension: {len(precise_embedding)}")
print(f"  Storage: ~{precise_storage} bytes")
print(f"  First 5 values: {precise_embedding[:5]}\n")

# Scale the per-vector sizes up to one million vectors, reported in MB
vector_count = 1_000_000
bytes_per_mb = 1024**2

print(f"Storage ratio: {precise_storage / compact_storage:.1f}x")
print("\nFor 1 million vectors:")
print(f"  Ultra-compact: ~{compact_storage * vector_count / bytes_per_mb:.1f} MB")
print(f"  High-precision: ~{precise_storage * vector_count / bytes_per_mb:.1f} MB")

## Cleaning up

To avoid incurring charges to your Google Cloud account for the resources used in this tutorial, delete the endpoint.

In [None]:
# Remove the endpoint; force=True undeploys any models still attached to it.
endpoint_label = endpoint.display_name
print(f"Deleting endpoint: {endpoint_label}")
endpoint.delete(force=True)
print("Endpoint deleted successfully!")