In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Getting Started with Mistral AI OCR
<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/mistralai_ocr.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fgenerative_ai%2Fmistralai_ocr.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">                                                                             
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/generative_ai/mistralai_ocr.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/mistralai_ocr.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  
</table>

NOTE: This notebook has been tested in the following environment:

Python version = 3.10

## Objective

Mistral OCR (25.05) is a model specialized in extracting text and images from documents. It is specifically built to preserve the structure of the document pages and automatically formats the extracted text in Markdown.

The objective of this notebook is to provide a summary overview of the Mistral OCR (25.05) model's capabilities and how to leverage it using the Vertex AI platform.

In [None]:
%pip install -U -q httpx

## Getting started

Before proceeding further, fill in the following information:

In [1]:
# Google Cloud project used in the endpoint URL (must be filled in).
PROJECT_ID = ""
# Region used for both the API hostname and the `locations/` URL segment
# (must be filled in).
REGION = ""
# OCR model name and version; joined as "<name>-<version>" when calling the API.
MODEL_NAME = "mistral-ocr"
MODEL_VERSION = "2505"
# Sample PDF that the rest of the notebook downloads and scans.
TEST_DOC_URL = "https://arxiv.org/pdf/2410.07073"

As a developer, your first step is to authenticate your notebook environment. If you are using Google Colab to run it, the following cell should take care of the authentication, otherwise it will run the `gcloud` command to retrieve the access token needed to authenticate your API calls:

In [None]:
import subprocess
import sys

# Always define `access_token` so that later cells can reference it in every
# environment; it stays None when no token could be retrieved.
access_token = None

if "google.colab" in sys.modules:
    from google.colab import auth

    # Interactive sign-in. The token itself is then requested from gcloud
    # below, exactly like in any other environment.
    auth.authenticate_user()

try:
    result = subprocess.run(
        ["gcloud", "auth", "print-access-token"],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    access_token = result.stdout.strip()
except FileNotFoundError:
    # The gcloud CLI is not installed (or not on PATH)
    print("Error while running command: `gcloud` executable not found")
except subprocess.CalledProcessError as e:
    print(f"Error while running command: {e.stderr}")

## Calling the OCR model (HTTP)

Start by defining a simple function that helps build the URL of your model endpoint:

In [None]:
def build_endpoint_url(
    region: str, project_id: str, model_name: str, model_version: str
) -> str:
    """Build the Vertex AI `rawPredict` URL of a Mistral AI publisher model.

    Args:
        region: Google Cloud region; used for both the API hostname and the
            `locations/` path segment.
        project_id: Google Cloud project ID.
        model_name: Publisher model name, e.g. "mistral-ocr".
        model_version: Model version, appended to the name with a dash.

    Returns:
        The fully-qualified endpoint URL, ending with `:rawPredict`.

    Raises:
        ValueError: If any argument is empty (for instance when the
            configuration placeholders were left blank), since the resulting
            URL would be malformed.
    """
    named_args = {
        "region": region,
        "project_id": project_id,
        "model_name": model_name,
        "model_version": model_version,
    }
    for arg_name, arg_value in named_args.items():
        if not arg_value:
            raise ValueError(f"`{arg_name}` must be a non-empty string")

    base_url = f"https://{region}-aiplatform.googleapis.com/v1"
    endpoint_url_segments = [
        base_url,
        f"projects/{project_id}",
        f"locations/{region}",
        "publishers/mistralai",
        f"models/{model_name}-{model_version}",
    ]
    specifier = "rawPredict"  # Streaming is not supported
    return "/".join(endpoint_url_segments) + f":{specifier}"

Before sending a request to the model endpoint, you need to prepare the document to be processed.

Calling the OCR model is done via an HTTP POST request where the document to be processed is passed in the payload as a base64-encoded string. The following cell defines another helper function that downloads a given PDF file from a URL and encodes it in base64. If you already have your own documents at hand, you can easily modify it to keep only the encoding part.

**Warning**: the larger the document, the bigger the payload and the longer the model will take to handle it. To avoid timeout issues it is advised to split the document into smaller chunks. The number and size of chunks will depend on the total volume of documents you wish to process.

In [None]:
import base64


def download_pdf_and_base64_encode(pdf_url: str) -> str:
    """Download a PDF and return its content as a base64-encoded string.

    Args:
        pdf_url: URL of the PDF file to download.

    Returns:
        The raw bytes of the response body, base64-encoded and decoded to a
        UTF-8 `str` so that it can be embedded in a JSON payload.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    # httpx does not follow redirects unless asked to; without this, a URL
    # that redirects to the actual file would fail in `raise_for_status()`.
    resp = httpx.get(pdf_url, follow_redirects=True)
    resp.raise_for_status()
    return base64.b64encode(resp.content).decode("utf-8")

You can now send the HTTP request to the model endpoint. Note that you can also optionally:

- limit the number of scanned pages,
- retrieve any image detected by the model, in the form of base64-encoded strings.

Check the detailed list of available options and values in the [Mistral AI API documentation](https://docs.mistral.ai/api/#tag/ocr).

In [None]:
import httpx

# URL
url = build_endpoint_url(
    region=REGION,
    project_id=PROJECT_ID,
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
)

# Headers
headers = {"Content-Type": "application/json", "Accept": "application/json"}
# Send the bearer token whenever one is available, in Colab as well: httpx
# does not attach any credentials on its own. `globals().get` tolerates the
# name being undefined when no token was retrieved during authentication.
bearer_token = globals().get("access_token")
if bearer_token:
    headers["Authorization"] = f"Bearer {bearer_token}"

# Payload: the PDF is embedded in the request as a base64 data URI
encoded_doc = download_pdf_and_base64_encode(TEST_DOC_URL)
payload = {
    "model": f"{MODEL_NAME}-{MODEL_VERSION}",
    "document": {
        "type": "document_url",
        "document_url": f"data:application/pdf;base64,{encoded_doc}",
    },
}

# Request (generous timeout: large documents take a while to process)
model_resp = httpx.post(url=url, headers=headers, json=payload, timeout=3600)

# Response: `raise_for_status()` raises on any unsuccessful status, so the
# body can be parsed unconditionally and `scanned_doc` is always defined.
model_resp.raise_for_status()
scanned_doc = model_resp.json()

## Parsing the results

If your request was successful, the `scanned_doc` variable should contain:

* `.pages` : a list of dicts containing, for each scanned page, the Markdown-formatted text detected.
* `.usage_info`: the total count of pages processed as well as the scanned document's size.

In [None]:
# Preview: first 256 characters of the Markdown extracted from the first page

content_extract = scanned_doc["pages"][0]["markdown"][:256]
print(content_extract)

In [None]:
# Usage metadata returned alongside the pages
content_info = scanned_doc["usage_info"]
print(content_info)

## (Advanced) Combining OCR with a multimodal model

In more elaborate scenarios, you may want to annotate the images of a document in addition to retrieving the document's text. This is made possible by adding a multimodal model such as `mistral-small-2503` to the mix and having it analyze the images extracted by the OCR operation. The overall operation becomes a two-step process, which takes longer but yields more information on the document's content.

To make the code more modular you can start by packaging the OCR call into a `call_ocr_model()` function that:
- downloads the document,
- converts it into a base64-encoded string,
- passes that string to the OCR model. We added a `pages` argument to optionally limit the number of pages processed, to shorten processing time.

In [None]:
from typing import Any, Dict, Optional


def call_ocr_model(
    endpoint_url: str,
    pdf_url: str,
    with_image_outputs: bool = False,
    pages: Optional[str] = None,
    access_token: Optional[str] = None,
) -> Dict[str, Any]:
    """Download a PDF and have it scanned by the OCR model.

    Args:
        endpoint_url: URL of the OCR model endpoint.
        pdf_url: URL of the PDF document to download and scan.
        with_image_outputs: Sent as `include_image_base64` in the request.
        pages: Optional page selection; forwarded unchanged as the `pages`
            field of the request when provided.
        access_token: Optional bearer token; the `Authorization` header is
            only added when it is provided.

    Returns:
        The JSON body of the model response, parsed into a dict.

    Raises:
        httpx.HTTPStatusError: If the download or the model call returns an
            unsuccessful HTTP status.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    if access_token:
        headers["Authorization"] = f"Bearer {access_token}"

    # The document is embedded in the payload as a base64 data URI
    encoded_doc = download_pdf_and_base64_encode(pdf_url)
    payload = {
        "model": f"{MODEL_NAME}-{MODEL_VERSION}",
        "document": {
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{encoded_doc}",
        },
        "include_image_base64": with_image_outputs,
    }
    if pages:
        payload["pages"] = pages

    with httpx.Client() as client:
        ocr_resp = client.post(
            url=endpoint_url, headers=headers, json=payload, timeout=3600
        )
        # Raises on any unsuccessful status, so the body can be returned
        # unconditionally instead of silently returning None for a 2xx
        # status other than 200.
        ocr_resp.raise_for_status()
        return ocr_resp.json()

To illustrate how the system works, you can scan the document pointed at by the `TEST_DOC_URL` URL as follows:

In [None]:
# Re-scan the sample document, this time restricting the page selection and
# requesting the base64 content of detected images (needed for annotation).
# Note: this overwrites the `scanned_doc` produced by the earlier request.
scanned_doc = call_ocr_model(
    endpoint_url=url,
    pdf_url=TEST_DOC_URL,
    pages="1-10",
    with_image_outputs=True,
    access_token=access_token,
)

Note that `with_image_outputs` is set to `True` because you will want to annotate the figures/images in the documents, so you need to retrieve their base64-encoded representation.

The next step is to define how the multimodal model will be called to analyze the image content. To do so, you will query another Mistral model available on Vertex AI: `mistral-small-2503`, which can process both text and image inputs. This is a basic system message you can pass to it:

In [None]:
# Multimodal model name and version; joined as "<name>-<version>" when
# calling the API.
VLM_NAME = "mistral-small"
VLM_VERSION = "2503"

# System message sent with every image annotation request
ANNOTATION_SYSTEM_PROMPT = """
Your mission is to provide a clear description to each image you will see.
Describe its features and key components.
Return your answer in a well-structured JSON object.
"""

To ensure that the image annotation sticks to a specific format, you will leverage another feature called _structured outputs_, which enforces strict schema rules when you require JSON output from the model. In practice, you can define your output structure with a Pydantic model, then convert it into a JSON Schema dictionary when passing it to the API. Here is an example:

In [None]:
import json

from pydantic import BaseModel, Field


# Output structure expected from the multimodal model for each image: a short
# and a long description, both required.
# NOTE: documented with comments rather than a class docstring on purpose,
# since pydantic would add a docstring to the generated JSON schema as its
# `description` and thereby change what is sent to the API.
class AnnotatedImage(BaseModel):
    # One-sentence summary
    short_desc: str = Field(
        ..., description="A short one-sentence summary of the image"
    )
    # Detailed description
    long_desc: str = Field(
        ..., description="A longer detailed description of the image"
    )


# Show the JSON Schema generated from the Pydantic model
schema = AnnotatedImage.model_json_schema()
print(json.dumps(schema, indent=4))

You can read more about structured outputs in the [Mistral documentation](https://docs.mistral.ai/capabilities/structured-output/custom_structured_output).

From there, you can write an `annotate_with_vlm_model()` function that will call the multimodal model and pass it a base64-encoded image to retrieve its description in a well-structured format:

In [None]:
from typing import Type


def annotate_with_vlm_model(
    endpoint_url: str,
    annotation_structure: Type[BaseModel],
    image_base64: str,
    access_token: Optional[str] = None,
    debug: bool = False,
) -> Dict[str, Any]:
    """Ask the multimodal model for a structured description of one image.

    Args:
        endpoint_url: URL of the multimodal model endpoint.
        annotation_structure: Pydantic model *class* (not an instance) whose
            JSON schema the model output must follow.
        image_base64: Base64-encoded image, sent as the `image_url` content
            of the user message.
        access_token: Optional bearer token; the `Authorization` header is
            only added when it is provided.
        debug: Currently unused; kept so that existing calls stay valid.

    Returns:
        The JSON object parsed from the content of the first choice's message.

    Raises:
        httpx.HTTPStatusError: If the model call returns an unsuccessful
            HTTP status.
    """
    # Headers
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    if access_token:
        headers["Authorization"] = f"Bearer {access_token}"

    # JSON output schema (extra keys are disallowed)
    annotation_schema = annotation_structure.model_json_schema()
    annotation_schema["additionalProperties"] = False
    payload = {
        "model": f"{VLM_NAME}-{VLM_VERSION}",
        "messages": [
            {"role": "system", "content": ANNOTATION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [{"type": "image_url", "image_url": image_base64}],
            },
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "schema": annotation_schema,
                "name": "image_schema",
                "strict": True,
            },
        },
    }

    # Request & response
    with httpx.Client() as client:
        vlm_resp = client.post(
            url=endpoint_url, headers=headers, json=payload, timeout=3600
        )
        # Raises on any unsuccessful status, so the body can be parsed
        # unconditionally; this avoids returning an unbound `annotation`
        # for a 2xx status other than 200.
        vlm_resp.raise_for_status()
        vlm_out = vlm_resp.json()
    return json.loads(vlm_out["choices"][0]["message"]["content"])

You can test your function on a page of the scanned document that contains one or more images:

In [None]:
from typing import List

# Page whose images are annotated in this test run
page_idx = 0

# Base64 content of every image reported for that page
images_b64 = [
    image["image_base64"] for image in scanned_doc["pages"][page_idx]["images"]
]

# Endpoint of the multimodal model
vlm_url = build_endpoint_url(
    region=REGION,
    project_id=PROJECT_ID,
    model_name=VLM_NAME,
    model_version=VLM_VERSION,
)

# One structured annotation per image, in the order the images were reported
annotations: List[Dict[str, Any]] = [
    annotate_with_vlm_model(
        endpoint_url=vlm_url,
        annotation_structure=AnnotatedImage,
        image_base64=imgb64,
        access_token=access_token,
    )
    for imgb64 in images_b64
]

print(f"Page {page_idx}:")
print(annotations)

Finally, in order to stitch everything together, you can run the following code, which edits the `scanned_doc` variable in place and adds an `annotations` field to each page, where every detected image has its description written:

In [None]:
# Attach to every page the list of its image annotations (in-place update)
for page in scanned_doc["pages"]:
    annotations: List[Dict[str, Any]] = [
        {
            "id": img["id"],
            "annotation": annotate_with_vlm_model(
                endpoint_url=vlm_url,
                annotation_structure=AnnotatedImage,
                image_base64=img["image_base64"],
                access_token=access_token,
            ),
        }
        for img in page["images"]
    ]
    # `page` is the very dict stored in `scanned_doc["pages"]`
    page["annotations"] = annotations

You can now retrieve both text and image descriptions:

In [None]:
# Index (within the returned pages) of the page to display
page_idx = 6
text = scanned_doc["pages"][page_idx]["markdown"]
annotations = scanned_doc["pages"][page_idx]["annotations"]
# Markdown text first, then a separator line, then the image annotations
print(text)
print(80 * "_")
print(annotations)