# Combine Two Processor Outputs Tool

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

## Objective
The objective of the tooling is to efficiently integrate the output of one AI processor (proto) with another. This integration results in a comprehensive final output that reflects the combined capabilities of both parsers. Technically, this process involves sending the document proto object from one parser to the next.

## Prerequisites 
* Python : Jupyter notebook (Vertex AI) or Google Colab 
* Permission to the Google project and Document AI 
* Input PDF Files

## Sync Code

### Importing Required Modules

In [None]:
import os
from google.api_core.client_options import ClientOptions
import google.auth.transport.requests
from google import auth
from google.cloud import documentai
from google.cloud import storage
import requests
import json
import mimetypes
from typing import List, Tuple

### Setup the required inputs

In [None]:
# Configuration for the first processor
# Replace with your Google Cloud Project ID
PROJECT_ID = "<YOUR_FIRST_PROCESSOR_PROJECT_ID>"  # e.g., "project-123"
# Specify the location for the first processor
LOCATION = "<LOCATION_FOR_FIRST_PROCESSOR>"  # e.g., "us"
# Replace with the ID of your first processor
PROCESSOR_ID = "<YOUR_FIRST_PROCESSOR_ID>"  # e.g., "1234abcd"

# The MIME type for the files to be processed
# NOTE(review): the code cells visible in this notebook do not read this
# constant; the sync pipeline guesses the MIME type from each file name.
MIME_TYPE = "application/pdf"  # Keep as-is if processing PDF files

# Configuration for the second processor
# Replace with your Google Cloud Project ID for the second processor
PROJECT_ID_2 = "<YOUR_SECOND_PROCESSOR_PROJECT_ID>"  # e.g., "project-456"
# Specify the location for the second processor
LOCATION_2 = "<LOCATION_FOR_SECOND_PROCESSOR>"  # e.g., "us"
# Replace with the ID of your second processor
PROCESSOR_ID_2 = "<YOUR_SECOND_PROCESSOR_ID>"  # e.g., "5678efgh"

# Google Cloud Storage paths, written as "<bucket>/<prefix>" without a
# leading "gs://" (the async cells prepend "gs://" themselves).
# Path to the input PDF files
input_path = "<PATH_TO_INPUT_PDF_FILES>"  # e.g., "bucket/input_pdf/" (omit the leading gs://)
# Path where the first parser's output is written
output_path1 = "<PATH_FOR_FIRST_PARSER_OUTPUT>"  # e.g., "bucket/first_parser_output" (omit the leading gs://)
# Path where the second parser's output is written
output_path2 = "<PATH_FOR_SECOND_PARSER_OUTPUT>"  # e.g., "bucket/second_parser_output" (omit the leading gs://)

* `PROJECT_ID`: Google Cloud Project ID for the first processor.
* `LOCATION`: Google Cloud project location for the first processor.
* `PROCESSOR_ID`: Processor ID of the first processor.
* `MIME_TYPE`: The MIME type of the files to be processed.
* `PROJECT_ID_2`: Google Cloud Project ID for the second processor.
* `LOCATION_2`: Google Cloud project location for the second processor.
* `PROCESSOR_ID_2`: Processor ID of the second processor.
* `input_path`: The path to the input PDF files.
* `output_path1`: The path for output from the first parser.
* `output_path2`: The path for output from the second parser.

### Run the Code

In [None]:
# Obtain a bearer token for the current user / service account
def get_access_token() -> str:
    """
    Fetch a fresh OAuth2 access token from the default credentials.

    Returns:
        str: The refreshed access token.
    """

    default_credentials = auth.default()[0]
    transport_request = google.auth.transport.requests.Request()
    # The token attribute is only populated/current after an explicit refresh.
    default_credentials.refresh(transport_request)
    return default_credentials.token


def list_files(bucket_name: str, prefix: str) -> List[str]:
    """
    Collect the names of all objects under a prefix in a GCS bucket.

    Args:
        bucket_name (str): Bucket to list.
        prefix (str): Only objects whose name starts with this prefix are returned.

    Returns:
        List[str]: Object names, in the order the listing yields them.
    """

    gcs_client = storage.Client()
    object_names: List[str] = []
    for gcs_object in gcs_client.list_blobs(bucket_name, prefix=prefix):
        object_names.append(gcs_object.name)
    return object_names


def download_blob(
    bucket_name: str, source_blob_name: str, destination_file_name: str
) -> str:
    """
    Copy one GCS object to a local file.

    Args:
        bucket_name (str): Bucket that holds the object.
        source_blob_name (str): Name of the object to fetch.
        destination_file_name (str): Local path to write the object to.

    Returns:
        str: ``destination_file_name``, returned unchanged for convenience.
    """

    source_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    source_blob.download_to_filename(destination_file_name)
    return destination_file_name


def upload_blob(
    bucket_name: str, source_file_name: str, destination_blob_name: str
) -> None:
    """
    Copy a local file into a GCS bucket.

    Args:
        bucket_name (str): Bucket that receives the object.
        source_file_name (str): Local path of the file to upload.
        destination_blob_name (str): Name of the object to create in the bucket.
    """

    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)


def get_bucket_and_prefix(full_path: str) -> Tuple[str, str]:
    """
    Split a GCS path into its bucket name and object prefix.

    The path is normally written as ``"<bucket>/<prefix>"``. A leading
    ``"gs://"`` is also accepted and ignored, so a full GCS URI does not end
    up being parsed as a bucket called ``"gs:"``.

    Args:
        full_path (str): The path containing the bucket name and prefix.

    Returns:
        Tuple[str, str]: ``(bucket_name, prefix)``. ``prefix`` is ``""`` when
        the path holds only a bucket name.
    """

    scheme = "gs://"
    if full_path.startswith(scheme):
        full_path = full_path[len(scheme):]

    # Everything before the first "/" is the bucket; the remainder (which may
    # itself contain slashes) is the prefix.
    bucket_name, _, prefix = full_path.partition("/")
    return bucket_name, prefix


def second_processer_calling(
    document: object, PROJECT_ID_2: str, LOCATION_2: str, PROCESSOR_ID_2: str
) -> dict:
    """
    Send an already-processed document to the second Document AI processor.

    The document is serialised to JSON and posted as ``inlineDocument`` to the
    processor's REST ``:process`` endpoint.

    Args:
        document (object): The Document AI document produced by the first processor.
        PROJECT_ID_2 (str): The Google Cloud project ID of the second processor.
        LOCATION_2 (str): The location of the second processor (e.g. "us", "eu").
        PROCESSOR_ID_2 (str): The ID of the second processor.

    Returns:
        dict: The ``document`` field of the second processor's JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """

    print("Processing through Second Parser")
    # The regional host must follow LOCATION_2; pinning it to "us" breaks
    # processors that live in any other location.
    url = (
        f"https://{LOCATION_2}-documentai.googleapis.com/v1/"
        f"projects/{PROJECT_ID_2}/locations/{LOCATION_2}/"
        f"processors/{PROCESSOR_ID_2}:process"
    )
    headers = {"Authorization": f"Bearer {get_access_token()}"}

    # Round-trip through JSON so the proto becomes a plain, serialisable dict.
    document_dict = json.loads(documentai.Document.to_json(document))

    response = requests.post(
        url, headers=headers, json={"inlineDocument": document_dict}
    )
    response.raise_for_status()
    return response.json()["document"]


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_content: bytes,
    mime_type: str,
) -> object:
    """
    Run one document through a Document AI processor synchronously.

    Args:
        project_id (str): The Google Cloud project ID.
        location (str): The location of the Document AI processor.
        processor_id (str): The ID of the Document AI processor.
        file_content (bytes): Raw bytes of the document.
        mime_type (str): MIME type describing ``file_content``.

    Returns:
        object: The ``document`` attribute of the processor's response.
    """

    print("Processing through First Parser")

    # The client is pointed at the regional endpoint matching ``location``.
    client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )
    processor_name = client.processor_path(project_id, location, processor_id)

    # Wrap the raw bytes and address them to the processor in one request.
    process_request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(content=file_content, mime_type=mime_type),
    )

    return client.process_document(request=process_request).document


def _join_blob_name(prefix: str, file_name: str) -> str:
    """Join a GCS prefix and a file name with exactly one "/" between them."""
    trimmed = prefix.strip("/")
    return f"{trimmed}/{file_name}" if trimmed else file_name


def process_files_in_bucket(input_path: str, output_path1: str, output_path2: str):
    """
    Run every file under ``input_path`` through both processors and store both outputs.

    For each input object the first processor's document is saved as JSON under
    ``output_path1``, then that document is sent to the second processor and the
    result is saved as JSON under ``output_path2``. Output objects are named
    after the input file's base name with a ``.json`` extension.

    Args:
        input_path (str): The path to the input bucket and prefix.
        output_path1 (str): The path to the output bucket and prefix for the first parser output.
        output_path2 (str): The path to the output bucket and prefix for the second parser output.
    """

    storage_client = storage.Client()
    input_bucket_name, input_prefix = get_bucket_and_prefix(input_path)
    output_bucket1_name, output_prefix1 = get_bucket_and_prefix(output_path1)
    output_bucket2_name, output_prefix2 = get_bucket_and_prefix(output_path2)

    # Bucket handles do not change between files, so build them once.
    input_bucket = storage_client.bucket(input_bucket_name)
    output_bucket1 = storage_client.bucket(output_bucket1_name)
    output_bucket2 = storage_client.bucket(output_bucket2_name)

    for blob in input_bucket.list_blobs(prefix=input_prefix):
        # Skip directory placeholders and the prefix object itself.
        if blob.name.endswith("/") or blob.name == input_prefix:
            continue

        file_name = blob.name
        content = blob.download_as_bytes()

        # Replace the original extension with .json
        base_file_name = os.path.splitext(os.path.basename(file_name))[0] + ".json"

        print("Processing file:", file_name)  # Debug print
        mime_type = mimetypes.guess_type(file_name)[0] or "application/octet-stream"
        print("Detected MIME type:", mime_type)  # Debug print

        # Process with first parser
        first_parser_output = online_process(
            PROJECT_ID, LOCATION, PROCESSOR_ID, content, mime_type
        )
        first_parser_output_json = json.loads(
            documentai.Document.to_json(first_parser_output)
        )

        # Save first parser output; the join avoids "//" or a leading "/" when
        # the configured prefix ends with a slash or is empty.
        output_blob1 = storage.Blob(
            _join_blob_name(output_prefix1, base_file_name), output_bucket1
        )
        output_blob1.upload_from_string(
            json.dumps(first_parser_output_json, indent=2),
            content_type="application/json",
        )

        # Process with second parser
        second_parser_output = second_processer_calling(
            first_parser_output, PROJECT_ID_2, LOCATION_2, PROCESSOR_ID_2
        )

        # Save second parser output
        output_blob2 = storage.Blob(
            _join_blob_name(output_prefix2, base_file_name), output_bucket2
        )
        output_blob2.upload_from_string(
            json.dumps(second_parser_output, indent=2),
            content_type="application/json",
        )


# Run the synchronous pipeline on the configured paths, then report completion
process_files_in_bucket(input_path, output_path1, output_path2)
print("Done")

## Async Code

### Importing Required Modules

In [None]:
import os
import google.auth.transport.requests
from google import auth
from google.cloud import documentai
from google.cloud import storage
import requests
import json
import mimetypes
import asyncio
import aiohttp

### Run the Code

In [None]:
# Obtain a bearer token for the current user / service account
def get_access_token() -> str:
    """
    Fetch a fresh OAuth2 access token from the default credentials.

    Returns:
        str: The refreshed access token.
    """

    default_credentials = auth.default()[0]
    transport_request = google.auth.transport.requests.Request()
    # The token attribute is only populated/current after an explicit refresh.
    default_credentials.refresh(transport_request)
    return default_credentials.token


def batch_process_documents(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    timeout: int = 6000,
) -> object:
    """
    Batch process documents using the Document AI processor.

    Every object under ``gcs_input_uri`` is submitted in a single batch
    request and the call blocks until the operation completes or times out.

    Args:
        project_id (str): The Google Cloud project ID.
        location (str): The location of the Document AI processor.
        processor_id (str): The ID of the Document AI processor.
        gcs_input_uri (str): The ``gs://`` prefix of the input documents.
        gcs_output_uri (str): The ``gs://`` prefix where results are written.
        timeout (int): Timeout for the operation in seconds. Defaults to 6000.

    Returns:
        object: The long-running operation, after it has finished.
    """

    from google.cloud import documentai_v1beta3 as documentai

    # Each location has its own regional endpoint; deriving it from ``location``
    # covers every region instead of only "us" and "eu".
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Exactly one trailing slash, whether or not the caller supplied one.
    destination_uri = f"{gcs_output_uri.rstrip('/')}/"

    input_config = documentai.BatchDocumentsInputConfig(
        gcs_prefix=documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config={"gcs_uri": destination_uri}
    )

    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result(timeout=timeout)
    return operation

In [None]:
# Batch-process everything under input_path with the first processor and
# write the results under output_path1; the paths are stored without a
# scheme, so "gs://" is prepended here.
res = batch_process_documents(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    gcs_input_uri=f"gs://{input_path}",
    gcs_output_uri=f"gs://{output_path1}",
)

In [None]:
async def second_processer_calling(
    document: object,
    PROJECT_ID_2: str,
    LOCATION_2: str,
    PROCESSOR_ID_2: str,
    session: aiohttp.ClientSession,
) -> object:
    """
    Asynchronously calls the second Document AI processor.

    The document is posted as ``inlineDocument`` to the processor's REST
    ``:process`` endpoint.

    Args:
        document (object): The JSON-serialisable document to process.
        PROJECT_ID_2 (str): The Google Cloud project ID for the second processor.
        LOCATION_2 (str): The location of the second processor (e.g. "us", "eu").
        PROCESSOR_ID_2 (str): The ID of the second processor.
        session (aiohttp.ClientSession): The aiohttp session for making HTTP requests.

    Returns:
        object: The ``document`` field of the second processor's JSON response.

    Raises:
        aiohttp.ClientResponseError: If the service answers with an error status.
    """

    print("Processing through Second Parser")
    # The regional host must follow LOCATION_2; pinning it to "us" breaks
    # processors that live in any other location.
    url = (
        f"https://{LOCATION_2}-documentai.googleapis.com/v1/"
        f"projects/{PROJECT_ID_2}/locations/{LOCATION_2}/"
        f"processors/{PROCESSOR_ID_2}:process"
    )
    # NOTE: get_access_token() is a synchronous call made from this coroutine.
    headers = {"Authorization": f"Bearer {get_access_token()}"}

    create_process_request = {"inlineDocument": document}
    async with session.post(
        url, headers=headers, json=create_process_request
    ) as response:
        response.raise_for_status()
        json_object = await response.json()
        return json_object["document"]


async def save_to_gcs(bucket: "storage.Bucket", blob_name: str, data: str) -> None:
    """
    Asynchronously saves data to Google Cloud Storage.

    Args:
        bucket (storage.Bucket): The bucket object to write into (anything
            exposing ``blob(name)`` works).
        blob_name (str): The name of the blob.
        data (str): The data to save.
    """

    blob = bucket.blob(blob_name)
    # upload_from_string is a blocking call; run it on the default executor so
    # the event loop stays free while the upload is in flight.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, blob.upload_from_string, data)


async def main():
    """
    Send every first-parser JSON under ``output_path1`` to the second
    processor and store each response under ``output_path2``.

    Reads the module-level ``output_path1``, ``output_path2``,
    ``PROJECT_ID_2``, ``LOCATION_2`` and ``PROCESSOR_ID_2`` settings. All
    requests are issued concurrently; results are uploaded once every request
    has finished.
    """

    # partition() also accepts a bare bucket name (no "/"), which would make
    # split("/", 1) with two-target unpacking raise ValueError.
    source_bucket_name, _, source_prefix = output_path1.partition("/")
    target_bucket_name, _, target_prefix = output_path2.partition("/")

    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(source_bucket_name)
    # Results go to output_path2's own bucket, which may differ from the
    # bucket holding the first parser's output.
    target_bucket = storage_client.bucket(target_bucket_name)

    async with aiohttp.ClientSession() as session:
        tasks = []
        original_filenames = []

        # Walk the first parser's output, including sub-folders.
        for blob in source_bucket.list_blobs(prefix=source_prefix):
            if not blob.name.endswith(".json"):
                continue

            document = json.loads(blob.download_as_bytes())
            # Only the base name is kept, so same-named JSON files from
            # different sub-folders map to the same output object.
            original_filenames.append(os.path.basename(blob.name))

            # Schedule asynchronous processing
            tasks.append(
                asyncio.ensure_future(
                    second_processer_calling(
                        document, PROJECT_ID_2, LOCATION_2, PROCESSOR_ID_2, session
                    )
                )
            )

        # Wait for all tasks to complete; gather keeps results in task order,
        # so they line up with original_filenames.
        results = await asyncio.gather(*tasks)

        # Save results to output_path2 with the same original file names
        target_prefix = target_prefix.strip("/")
        for filename, result in zip(original_filenames, results):
            blob_name = f"{target_prefix}/{filename}" if target_prefix else filename
            await save_to_gcs(target_bucket, blob_name, json.dumps(result))


# Top-level await works inside a notebook cell; in a plain script use
# asyncio.run(main()) instead.
await main()