# Document AI Processor Visual Assessment Tool

* Author: docai-incubator@google.com

## Disclaimer
This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.

## Objective

The Python script is designed to produce an Excel report from the provided PDFs using a chosen Document AI processor. This report offers a snapshot of the entity extraction by the processor from the PDFs and visually presents the bounding boxes within the images in the Excel sheet. 
For the Form Parser, the report will detail the key-value pairs and the table structures. For other processors, the report will showcase the predicted entity types paired with the corresponding mention text.

## Prerequisites 
   * Python : Jupyter notebook (Vertex AI)
   * Permission to the Google Cloud project and access to the Document AI processor


## Step by step procedure

### Install the required Libraries

In [None]:
# Run this cell to download utilities module
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install pandas
!pip install pillow
!pip install opencv-python
!pip install google-cloud-documentai==2.16.0
!pip install google-cloud-storage
!pip install xlsxwriter

### Import the libraries

In [5]:
import os
import time
import json
import io
from io import BytesIO
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import cv2
import base64
import tempfile
from os.path import splitext
import numpy as np
from google.cloud import documentai_v1beta3
from google.cloud import storage
from typing import List, Sequence, Dict, Any, Tuple, Optional
from pprint import pprint
import utilities

### Setup the required inputs

In [None]:
# Google Cloud project that hosts the Document AI processor and the storage bucket
project_id = "<your-project-id>"
# ID of the Document AI processor used to process the PDFs
processor_id = "<your-processor-id>"
# Bucket name only (no "gs://" prefix); batch-processing output is written under it
bucket_name = "<bucket-name>"
# gs:// path of the folder holding the input PDFs
input_pdfs_path = "gs://<bucket-name>/<subfolder-name>"

 * `project_id`: The ID of the project where your Document AI processor and storage bucket are located.
 * `processor_id`: The ID of the processor you intend to use for evaluation.
 * `bucket_name`: The name of the bucket where batch processing results will be stored.
 * `input_pdfs_path`: The Google Cloud Storage (gs) path for the PDFs you wish to process with the Document AI.

In [None]:
def get_processor_type(
    project_id_: str, processor_id_: str, location: str = "us"
) -> Tuple[str, str]:
    """
    Retrieves the display name and type of a specific Document AI processor.

    Args:
    project_id_ (str): The project ID in Google Cloud.
    processor_id_ (str): The ID of the processor within the specified project.
    location (str, optional): Region in which the processor is hosted, for
                      example "us" or "eu". Defaults to "us", the value that
                      was previously hard-coded.

    Returns:
    Tuple[str, str]: A tuple containing the display name and type of the processor.
    """
    # Processors outside the default region must be reached through their
    # regional endpoint; the default client configuration is kept for "us".
    client_options = (
        None
        if location == "us"
        else {"api_endpoint": f"{location}-documentai.googleapis.com"}
    )
    client = documentai_v1beta3.DocumentProcessorServiceClient(
        client_options=client_options
    )
    request = documentai_v1beta3.GetProcessorRequest(
        name=f"projects/{project_id_}/locations/{location}/processors/{processor_id_}",
    )
    response = client.get_processor(request=request)
    return response.display_name, response.type_


def convert_base64_to_image(base64_text):
    """
    Converts encoded image data to an image.

    Args:
    base64_text (str | bytes): Either the raw encoded image bytes (this is what
                      `page.image.content` holds on a parsed Document), or a
                      string containing the base64 encoded data of an image.
                      A string can optionally start with 'data:image/png;base64,'.

    Returns:
    Image: An image object created from the data, or None (after printing a
           message) when the data cannot be decoded or opened as an image.
    """
    try:
        if isinstance(base64_text, str):
            # Drop an optional data-URI header, then decode the base64 payload.
            # rpartition leaves the whole string as payload when no header exists.
            _, _, payload = base64_text.rpartition("base64,")
            raw_bytes = base64.b64decode(payload)
        else:
            # Already binary image data
            raw_bytes = base64_text
        return Image.open(io.BytesIO(raw_bytes))
    except (IOError, ValueError):
        # IOError: PIL could not read the data; ValueError: malformed base64
        print("Error in loading the image. The image data might be corrupted.")
        return None


def draw_cells(
    cells: List[Any], image: Image, draw: ImageDraw, color: str, border_width: int
) -> None:
    """
    Draws borders around specified cells on an image.

    Args:
    cells (List[Any]): A list of cell objects, each containing layout information.
    image (Image): The image object on which the cells will be drawn.
    draw (ImageDraw): An ImageDraw object to draw on the image.
    color (str): The color of the border.
    border_width (int): The width of the border around each cell.

    The function does not return anything but modifies the given ImageDraw object.
    Cells that lack the expected layout attributes are skipped.
    """
    for cell in cells:
        try:
            # Vertices are normalized to [0, 1]; convert them to pixel coordinates
            vertices = [
                (v.x * image.width, v.y * image.height)
                for v in cell.layout.bounding_poly.normalized_vertices
            ]
        except AttributeError:
            # Skip if the necessary attributes are not present
            continue

        # Let Pillow thicken the outline (same call as draw_kvp). Drawing
        # shifted copies of the polygon would displace the border up-left of
        # the real cell bounds.
        draw.polygon(vertices, outline=color, width=border_width)


def draw_kvp(
    kvp: Any, image: Image, draw: ImageDraw, color: str, border_width: int
) -> None:
    """
    Outlines one key-value pair (KVP) element on an image.

    Args:
    kvp (Any): Object exposing `bounding_poly.normalized_vertices`, i.e. the
               normalized outline of the element to highlight.
    image (Image): The image whose width and height are used to convert the
                   normalized vertices to pixel coordinates.
    draw (ImageDraw): An ImageDraw object to draw on the image.
    color (str): The color of the polygon's outline.
    border_width (int): The width of the polygon's outline.

    Nothing is returned; the outline is drawn through the given ImageDraw object.
    Elements missing the expected attributes are silently ignored.
    """
    try:
        outline_points = []
        for vertex in kvp.bounding_poly.normalized_vertices:
            # Normalized coordinate -> pixel coordinate
            pixel_x = vertex.x * image.width
            pixel_y = vertex.y * image.height
            outline_points.append((pixel_x, pixel_y))

        draw.polygon(outline_points, outline=color, width=border_width)
    except AttributeError:
        # Required attributes are absent: nothing to draw
        return


def get_kvp_data(kvp_fields: List[Any], text: str) -> Dict[str, str]:
    """
    Builds a name -> value mapping from a list of form fields.

    Args:
    kvp_fields (List[Any]): Form field objects; each one exposes 'field_name' and
                            'field_value', both carrying a 'text_anchor'.
    text (str): The full document text that the text anchors index into.

    Returns:
    Dict[str, str]: Field name mapped to field value, both stripped of surrounding
                    whitespace. When a name occurs more than once, the last
                    occurrence wins.
    """
    name_value_pairs = (
        (
            text_anchor_to_text(entry.field_name.text_anchor, text).strip(),
            text_anchor_to_text(entry.field_value.text_anchor, text).strip(),
        )
        for entry in kvp_fields
    )
    return dict(name_value_pairs)


def get_table_data(rows: Sequence[Any], text: str) -> List[List[str]]:
    """
    Collects the text of every cell of the given table rows.

    Args:
    rows (Sequence[Any]): Row objects; each exposes 'cells', and every cell carries
                          a 'layout.text_anchor'.
    text (str): The full document text that the text anchors index into.

    Returns:
    List[List[str]]: One inner list per row, holding that row's cell texts in order.
    """
    return [
        [text_anchor_to_text(cell.layout.text_anchor, text) for cell in row.cells]
        for row in rows
    ]


def text_anchor_to_text(text_anchor: Any, text: str) -> str:
    """
    Resolves a text anchor to the text it points at.

    Args:
    text_anchor (Any): Object exposing 'text_segments'; each segment has an
                       'end_index' and, optionally, a 'start_index'.
    text (str): The complete text the segment offsets refer to.

    Returns:
    str: The concatenation of all referenced segments, stripped of surrounding
         whitespace, with newline characters replaced by spaces.
    """
    pieces = []
    for part in text_anchor.text_segments:
        # A segment without a start offset begins at the start of the text
        begin = getattr(part, "start_index", 0)
        pieces.append(text[begin : part.end_index])

    joined = "".join(pieces)
    return joined.strip().replace("\n", " ")


def draw_bounding_box(
    draw: ImageDraw,
    vertices: List[Tuple[float, float]],
    image: Image,
    color: str = "blue",
    scale_factor: float = 1.05,
    width: int = 3,
) -> None:
    """
    Draws a bounding box or polygon based on the provided vertices on an image.

    Args:
    draw (ImageDraw): The ImageDraw instance to draw on the image.
    vertices (List[Tuple[float, float]]): List of tuples representing normalized coordinates (x, y) of the vertices.
    image (Image): The image on which to draw the bounding box or polygon.
    color (str, optional): The color of the bounding box or polygon. Defaults to "blue".
    scale_factor (float, optional): Factor by which the shape is enlarged around
        its own center. Defaults to 1.05 (a 5% margin).
    width (int, optional): Outline width in pixels. Defaults to 3.

    The function doesn't return anything but modifies the ImageDraw object to include the bounding box or polygon.
    Fewer than two vertices do not describe a shape, so nothing is drawn.
    """
    image_width, image_height = image.size
    pixel_points = [(x * image_width, y * image_height) for x, y in vertices]

    if len(pixel_points) < 2:
        return

    # Scale around the centroid. Multiplying the absolute coordinates instead
    # would push the whole box away from the image origin (down and to the
    # right), so it would no longer sit on the entity it marks.
    center_x = sum(px for px, _ in pixel_points) / len(pixel_points)
    center_y = sum(py for _, py in pixel_points) / len(pixel_points)
    scaled_vertices = [
        (
            center_x + (px - center_x) * scale_factor,
            center_y + (py - center_y) * scale_factor,
        )
        for px, py in pixel_points
    ]

    if len(scaled_vertices) > 2:
        draw.polygon(scaled_vertices, outline=color, width=width)
    else:
        # Exactly two vertices: treat them as opposite corners. Order them so
        # the first corner is top-left, which rectangle drawing expects.
        (x0, y0), (x1, y1) = scaled_vertices
        corners = [(min(x0, x1), min(y0, y1)), (max(x0, x1), max(y0, y1))]
        draw.rectangle(corners, outline=color, width=width)


def _form_table_frames(document_object: Any) -> List[pd.DataFrame]:
    """Returns one DataFrame per detected table, each starting with a 'Table N Page M' title row."""
    frames: List[pd.DataFrame] = []
    for page_number, page in enumerate(document_object.pages, start=1):
        # Table numbering restarts on every page
        for table_number, table in enumerate(page.tables, start=1):
            all_rows = list(table.header_rows) + list(table.body_rows)
            table_data = get_table_data(all_rows, document_object.text)

            # Pad the title row to the table width; a table without any cell
            # still gets a single column so that its title can be shown.
            column_count = max([len(values) for values in table_data] + [1])
            title_row = [f"Table {table_number} Page {page_number}"] + [""] * (
                column_count - 1
            )
            frames.append(pd.DataFrame([title_row] + table_data))
    return frames


def _form_kvp_frame(document_object: Any) -> pd.DataFrame:
    """Returns all key-value pairs of the document with their 1-based page number."""
    records = []
    for page_number, page in enumerate(document_object.pages, start=1):
        if page.form_fields:
            pairs = get_kvp_data(page.form_fields, document_object.text)
            records.extend((name, value, page_number) for name, value in pairs.items())
    return pd.DataFrame(records, columns=["Name", "Value", "Page Number"])


def _form_page_images(
    document_object: Any, border_width: int = 5
) -> List[Tuple[int, BytesIO]]:
    """Renders each page with KVP and table-cell outlines; returns (page number, PNG buffer) pairs."""
    colors = ["red", "blue", "green", "orange", "purple"]
    color_idx = 0  # keeps cycling across pages so neighbouring tables differ
    rendered: List[Tuple[int, BytesIO]] = []

    for page_number, page in enumerate(document_object.pages, start=1):
        image = convert_base64_to_image(page.image.content)
        if image is None:
            # The page image could not be decoded; keep the rest of the report
            print(f"Skipping image for page {page_number}: image could not be loaded.")
            continue
        draw = ImageDraw.Draw(image)

        # Key-value pairs are always outlined in black
        for kvp in page.form_fields:
            draw_kvp(kvp.field_name, image, draw, "black", border_width)
            draw_kvp(kvp.field_value, image, draw, "black", border_width)

        # Each table gets the next color of the palette
        for table in page.tables:
            color = colors[color_idx]
            color_idx = (color_idx + 1) % len(colors)
            for rows in (table.body_rows, table.header_rows):
                for row in rows:
                    draw_cells(row.cells, image, draw, color, border_width)

        # Keep the PNG in memory: nothing is left behind on disk and the
        # data stays available until the workbook is finally written.
        buffer = BytesIO()
        image.save(buffer, "PNG")
        buffer.seek(0)
        rendered.append((page_number, buffer))
    return rendered


def generate_data_for_file(
    document_object: Any, blob_name: str, writer: pd.ExcelWriter
):
    """
    Processes the content of a provided document object and generates data for an Excel file.
    This function is specifically tailored for visualizing results from a form parser,
    which includes key-value pairs and tables. It generates images with highlighted areas
    corresponding to these elements and inserts them into the Excel file.

    Args:
    - document_object (Any): The document object containing parsed data from the form parser.
    - blob_name (str): The name of the blob (file) being processed. Its base name
      (without extension, truncated to 31 characters) becomes the sheet name.
    - writer (pd.ExcelWriter): An Excel writer object for writing data to an Excel file.

    Sheet layout: the key-value pairs come first, followed by every table (each
    introduced by a bold "Table N Page M" title row, data cells bordered). The
    annotated page images are placed from column index 10 onwards, one every 30 rows.
    Documents without tables or without key-value pairs are handled.
    """
    table_frames = _form_table_frames(document_object)

    # Remember where each title row lands once the tables are stacked, so that
    # titles are recognised by position rather than by their text.
    title_positions = set()
    offset = 0
    for frame in table_frames:
        title_positions.add(offset)
        offset += len(frame)

    # pd.concat raises on an empty list, which happens for table-free documents
    combined_df = (
        pd.concat(table_frames, ignore_index=True) if table_frames else pd.DataFrame()
    )
    kvp_df = _form_kvp_frame(document_object)
    page_images = _form_page_images(document_object)

    # Add data to the Excel writer (Excel limits sheet names to 31 characters)
    sheet_name = os.path.splitext(os.path.basename(blob_name))[0][:31]
    kvp_df.to_excel(writer, sheet_name=sheet_name, index=False)

    workbook = writer.book
    worksheet = writer.sheets[sheet_name]

    bold_format = workbook.add_format({"bold": True})
    border_format = workbook.add_format({"border": 1})

    # Leave a gap below the KVP block (header row + data rows + one blank row)
    start_row = len(kvp_df) + 2 if not kvp_df.empty else 0
    if not combined_df.empty:
        combined_df.to_excel(
            writer, sheet_name=sheet_name, startrow=start_row, index=False, header=False
        )

    # Re-write the table cells with formatting: bold titles, bordered values
    for position, (_, row) in enumerate(combined_df.iterrows()):
        row_num = start_row + position
        if position in title_positions:
            worksheet.write(row_num, 0, row.iloc[0], bold_format)
            continue
        # Missing cells (NaN padding of shorter rows) stay blank
        for col_num, cell_value in enumerate(row):
            if pd.notna(cell_value):
                worksheet.write(row_num, col_num, cell_value, border_format)

    # Insert the page images; 30 rows are reserved per scaled-down image
    image_row = 0
    for page_number, buffer in page_images:
        worksheet.insert_image(
            image_row,
            10,
            f"{sheet_name}_page_{page_number}.png",  # label only, data comes from buffer
            {"x_scale": 0.3, "y_scale": 0.3, "image_data": buffer},
        )
        image_row += 30


def generate_data_for_file_normal(
    document_object: Any, blob_name: str, writer: pd.ExcelWriter
):
    """
    Processes the provided file content from a document object and generates data for an Excel file.
    Additionally, it creates images with bounding boxes for entities found in the document,
    and inserts these images into the Excel file.

    Args:
    - document_object (Document): The Document AI object containing entities and page information.
    - blob_name (str): The name of the blob file being processed. Its base name
      (without extension, truncated to 31 characters) becomes the sheet name.
    - writer (pd.ExcelWriter): An Excel writer object used to write data to an Excel file.

    The function lists the entities (for an entity with properties, the properties
    are listed instead of the parent), sorted by page, in an Excel sheet. For each
    page it renders an image with bounding boxes around the detected entities and
    their properties, and inserts it from column index 10 onwards, one every 30 rows.
    Entities or properties without a page reference are listed with page 0 but
    cannot be located, so no box is drawn for them.
    """
    entities_list = []
    # Normalized polygons to draw, grouped by the 0-based page they belong to
    polygons_by_page: Dict[int, List[List[Tuple[float, float]]]] = {}

    for entity in document_object.entities:
        # Table rows: a parent entity is represented by its properties only
        listed = entity.properties if entity.properties else [entity]
        for member in listed:
            page_refs = member.page_anchor.page_refs
            page = page_refs[0].page if page_refs else 0
            entities_list.append([member.type_, member.mention_text, page])

        # Boxes: the entity itself and every property, each on its own page
        for member in [entity, *entity.properties]:
            page_refs = member.page_anchor.page_refs
            if not page_refs:
                # No location information; indexing page_refs[0] would fail
                continue
            first_ref = page_refs[0]
            vertices = [
                (v.x, v.y) for v in first_ref.bounding_poly.normalized_vertices
            ]
            polygons_by_page.setdefault(int(first_ref.page), []).append(vertices)

    df = pd.DataFrame(entities_list, columns=["Type", "MentionText", "Page"])
    df["Page"] = df["Page"].astype(int)
    df_sorted = df.sort_values(by="Page")

    page_images = []

    # Render each page with its bounding boxes
    for page_index, page in enumerate(document_object.pages):
        image = convert_base64_to_image(page.image.content)
        if image is None:
            # The page image could not be decoded; keep the rest of the report
            print(f"Skipping image for page {page_index}: image could not be loaded.")
            continue
        draw = ImageDraw.Draw(image)

        for vertices in polygons_by_page.get(page_index, []):
            draw_bounding_box(draw, vertices, image, color="blue")

        # Keep the PNG in memory: nothing is left behind on disk and the
        # data stays available until the workbook is finally written.
        buffer = BytesIO()
        image.save(buffer, "PNG")
        buffer.seek(0)
        page_images.append((page_index, buffer))

    # Add data to the Excel writer (Excel limits sheet names to 31 characters)
    sheet_name = os.path.splitext(os.path.basename(blob_name))[0][:31]
    df_sorted.to_excel(writer, sheet_name=sheet_name, index=False)

    worksheet = writer.sheets[sheet_name]

    # Insert the page images; 30 rows are reserved per scaled-down image
    image_row = 0
    for page_index, buffer in page_images:
        worksheet.insert_image(
            image_row,
            10,
            f"{sheet_name}_page_{page_index}.png",  # label only, data comes from buffer
            {"x_scale": 0.3, "y_scale": 0.3, "image_data": buffer},
        )
        image_row += 30


def generate_parser_viz_for_folder(bucket_path: str, processor_type: str) -> None:
    """
    Builds the Excel report from the Document AI JSON outputs stored under a GCS path.

    Args:
    bucket_path (str): Location of the JSON outputs, as "gs://<bucket>/<prefix>",
                       "<bucket>/<prefix>" or just the bucket (all blobs are scanned).
    processor_type (str): Type of the processor that produced the outputs.
                       "FORM_PARSER_PROCESSOR" gets the key-value/table report,
                       any other type gets the entity report.

    The report is written to "<processor_type>_report.xlsx" in the current working
    directory, one sheet per JSON file. A file that fails to process is reported
    on stdout and skipped, so one bad document does not abort the whole report.
    """
    storage_client = storage.Client()
    # Parsing bucket name and prefix from the bucket path
    if bucket_path.startswith("gs://"):
        bucket_path = bucket_path[5:]
    # partition (unlike split + unpacking) also copes with a path without prefix
    bucket_name, _, prefix = bucket_path.partition("/")
    bucket = storage_client.get_bucket(bucket_name)

    with pd.ExcelWriter(f"{processor_type}_report.xlsx", engine="xlsxwriter") as writer:
        for blob in bucket.list_blobs(prefix=prefix):
            if not blob.name.endswith(".json"):
                continue
            try:
                # Tolerate fields the installed client library does not know yet
                document_object = documentai_v1beta3.Document.from_json(
                    blob.download_as_text(), ignore_unknown_fields=True
                )
                if processor_type == "FORM_PARSER_PROCESSOR":
                    generate_data_for_file(document_object, blob.name, writer)
                else:
                    generate_data_for_file_normal(document_object, blob.name, writer)
            except Exception as e:
                # Per-file boundary: report the problem and go on with the next file
                print(e)
                print(f"Skipped - {blob.name}")


# Step 1: find out which processor is being evaluated
processor_display_name, processor_type = get_processor_type(project_id, processor_id)

# Step 2: derive per-run names from the current timestamp so runs do not collide.
# NOTE: xlsx_file_name is not used below; the success message names the report
# file as "<processor_type>_report.xlsx".
current_time = time.strftime("%Y%m%d%H%M%S")
folder_path = f"gs://{bucket_name}/{processor_display_name}_{current_time}"
xlsx_file_name = f"{processor_display_name}_{current_time}.xlsx"

# Step 3: batch process the input PDFs; results go below the run folder
print(
    f"Batch Processing the Documents with the {processor_display_name} - {processor_type}"
)
batch_output_uri = f"{folder_path}/batch_process_outputs"
res = utilities.batch_process_documents_sample(
    project_id=project_id,
    location="us",
    processor_id=processor_id,
    gcs_input_uri=input_pdfs_path,
    gcs_output_uri=batch_output_uri,
)
print("Batch Process Completed")

# Step 4: turn the JSON outputs into the Excel report
try:
    is_form_parser = processor_type == "FORM_PARSER_PROCESSOR"
    if is_form_parser:
        detected_message = f"{processor_type} Detected \nVisualizing the key-value pairs and the table structures"
    else:
        detected_message = f"{processor_type} Detected \nVisualizing the entities"
    print(detected_message)

    # The whole run folder is scanned; it contains the batch output sub-folder
    generate_parser_viz_for_folder(folder_path, processor_type)
    print(
        f"Report is generated. Please find the Visualization in {processor_type}_report.xlsx"
    )
except Exception:
    import traceback

    # Tell the user something went wrong, then show the details
    print("Issue occurred. Please check the input field and JSON")
    traceback.print_exc()

### Visualization Output  

<img src="./images/invoice_out.png" width="1000" height="800" alt="Visualization output">