In [None]:
%pip install --upgrade google-cloud-documentai google-cloud-documentai-toolbox pandas --user

In [None]:
# Log in with the gcloud CLI to set up Application Default Credentials.
# NOTE(review): presumably the Document AI client created later in this notebook
# picks these credentials up automatically — confirm for your environment.
!gcloud auth application-default login

In [6]:
from typing import List, Optional

# https://googleapis.dev/python/google-api-core/latest/client_options.html
from google.api_core.client_options import ClientOptions

# https://cloud.google.com/python/docs/reference/documentai/latest
from google.cloud import documentai

# https://cloud.google.com/document-ai/docs/toolbox
from google.cloud import documentai_toolbox

import pandas as pd
from tabulate import tabulate

In [None]:
# TODO(developer): Fill these variables before running the sample.
# These names are read by the cells below; keep them unchanged.
project_id = "YOUR_PROJECT_ID"
# Also used to build the regional API endpoint `{location}-documentai.googleapis.com`.
location = "us"  # Format is "us" or "eu"
processor_id = "YOUR_PROCESSOR_ID"  # Create processor before running sample
# Combined with the three values above into the processor version resource name.
processor_version_id = "YOUR_PROCESSOR_VERSION_ID"

# Input location: split into bucket name and prefix before batches are created.
gcs_input_uri = "YOUR_INPUT_BUCKET"  # Format: `gs://bucket/directory/`
# Output location: passed as the `gcs_uri` of the batch request's output config.
gcs_output_uri = "YOUR_OUTPUT_BUCKET"  # Must end with a trailing slash `/`. Format: `gs://bucket/directory/subdirectory/`

# Passed as `batch_size` to `create_batches()`; one request is sent per batch.
batch_size = 1000
field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.

## Batch Processing

- Create batches of up to `batch_size` documents (1000 in this sample) from the files in Google Cloud Storage.
- Make a batch processing request for each batch.
- Collect the long-running operation name returned for each request.

In [5]:
def batch_process_toolbox(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    batch_size: int,
    field_mask: Optional[str] = None,
    skip_human_review: bool = True,
) -> List[str]:
    """Send one Document AI batch-processing request per batch of input files.

    The files under ``gcs_input_uri`` are grouped into batches with the
    Document AI Toolbox, and a `BatchProcessRequest` is submitted for each
    batch. The function does not wait for the requests to finish.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also selects the API endpoint
            ``{location}-documentai.googleapis.com``.
        processor_id: ID of the processor.
        processor_version_id: ID of the processor version to run.
        gcs_input_uri: Cloud Storage URI whose files are batched.
        gcs_output_uri: Cloud Storage URI set as the output destination.
        batch_size: Passed to ``create_batches`` as ``batch_size``.
        field_mask: Optional field mask set on the output configuration.
        skip_human_review: Forwarded to each request as ``skip_human_review``.

    Returns:
        The operation name of every submitted request, in submission order.
        Format: ``projects/{project_id}/locations/{location}/operations/{operation_id}``
    """
    # https://googleapis.dev/python/google-api-core/latest/client_options.html
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=endpoint_options
    )

    # Full resource name of the processor version:
    # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
    processor_version_name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    # Where the output is written in Cloud Storage, and which fields to keep.
    gcs_destination = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config=gcs_destination
    )

    # Group the input files into batches.
    # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.utilities.gcs_utilities
    bucket_name, prefix = documentai_toolbox.gcs_utilities.split_gcs_uri(
        gcs_input_uri
    )
    input_batches = documentai_toolbox.gcs_utilities.create_batches(
        bucket_name, prefix, batch_size=batch_size
    )
    print(f"{len(input_batches)} batches created.")

    def submit(input_batch) -> str:
        """Report the batch contents, submit it, and return the operation name."""
        batch_documents = input_batch.gcs_documents.documents
        print(f"{len(batch_documents)} files in batch.")
        print(batch_documents)

        # https://cloud.google.com/document-ai/docs/send-request?hl=en#async-processor
        # `batch_process_documents()` returns a Long Running Operation (LRO).
        lro = client.batch_process_documents(
            documentai.BatchProcessRequest(
                name=processor_version_name,
                input_documents=input_batch,
                document_output_config=output_config,
                skip_human_review=skip_human_review,
            )
        )
        return lro.operation.name

    return [submit(input_batch) for input_batch in input_batches]

## Retrieve results once processing is complete

- Get the output [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) JSON files from `gcs_output_uri` based on each operation name.

In [None]:
def _location_from_operation_name(operation_name: str) -> str:
    """Return the `{location}` segment of a long-running operation name.

    Expected layout:
    ``projects/{project_id}/locations/{location}/operations/{operation_id}``

    Raises:
        ValueError: If no non-empty segment follows a ``locations`` segment.
    """
    segments = operation_name.split("/")
    found = ""
    if "locations" in segments:
        position = segments.index("locations") + 1
        if position < len(segments):
            found = segments[position]
    if not found:
        raise ValueError(
            f"Cannot determine location from operation name: {operation_name!r}"
        )
    return found


def retrieve_results(
    operation_names: List[str],
    location: Optional[str] = None,
) -> "List[documentai_toolbox.document.Document]":
    """Collect the toolbox `Document` objects produced by each operation.

    Args:
        operation_names: Operation names, in the format
            ``projects/{project_id}/locations/{location}/operations/{operation_id}``.
        location: Optional location to use for every operation. When omitted,
            the location is read from each operation name, so the function
            does not depend on a notebook-level ``location`` variable.

    Returns:
        The documents of all operations, in the order the operations were given.

    Raises:
        ValueError: If ``location`` is omitted and an operation name does not
            contain a location segment.
    """
    # Can do this asynchronously to avoid blocking
    all_documents: List[documentai_toolbox.document.Document] = []

    for operation_name in operation_names:
        # https://cloud.google.com/document-ai/docs/long-running-operations
        print(f"Waiting for operation {operation_name}")
        # An explicit argument wins; otherwise use the operation's own location.
        operation_location = location or _location_from_operation_name(
            operation_name
        )
        all_documents.extend(
            documentai_toolbox.document.Document.from_batch_process_operation(
                location=operation_location, operation_name=operation_name
            )
        )
    return all_documents

## Print results

- Export the extracted entities of each document as a dictionary.
- Load the dictionary into a Pandas DataFrame.
- Print the DataFrame as a table.

In [7]:
# Submit one request per batch and keep the resulting operation names.
operation_names = batch_process_toolbox(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    processor_version_id=processor_version_id,
    gcs_input_uri=gcs_input_uri,
    gcs_output_uri=gcs_output_uri,
    batch_size=batch_size,
    field_mask=field_mask,
)

# Fetch the output documents of every operation.
documents = retrieve_results(operation_names)

# Print one single-row table per document.
for document in documents:
    # `entities_to_dict()` reference:
    # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document#google_cloud_documentai_toolbox_wrappers_document_Document_entities_to_dict
    entities = document.entities_to_dict()

    # Optional alternative: export the entities to BigQuery.
    # job = document.entities_to_bigquery(dataset_name, table_name, project_id=project_id)

    df = pd.DataFrame([entities])
    table = tabulate(df, headers="keys", tablefmt="psql")
    print(table)

Waiting for operation projects/908687846511/locations/us/operations/10629941502076889990
+----+----------------------------------------------+------------------+----------------------------+------------------------------------+-----------------------+----------------+
|    | contract_title                               | execution_date   | jurisdiction_governed_by   | legal_parties                      | renewal_term_length   | term_length    |
|----+----------------------------------------------+------------------+----------------------------+------------------------------------+-----------------------+----------------|
|  0 | WORLDWIDE LICENSE AND DISTRIBUTION AGREEMENT | August 6, 2015   | New York.                  | ['Cymbal Inc.', 'B-Cafetal, Inc.'] | five (5) year         | Ten (10) years |
+----+----------------------------------------------+------------------+----------------------------+------------------------------------+-----------------------+----------------+
