# Extending Entity Bounding Box


* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied. 


## Objective

This document explains how to enlarge the bounding boxes of entities in parsed Document AI JSON files. The updated documents can then be used to train a processor whose entity bounding boxes cover the maximum area, which gives better results.

## Prerequisites

* Vertex AI Notebook Or Colab (If using Colab, use authentication)
* Storage Bucket for storing input and output json files
* Permissions for Google Cloud Storage and Vertex AI Notebook.



## Step by Step procedure

### 1. Importing Required Modules

In [None]:
!pip install pandas numpy google-cloud-storage google-cloud-documentai==2.16.0 PyPDF2 configparser
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [3]:
# importing libraries
import cv2
import numpy as np
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from pathlib import Path
import cv2
import numpy as np
import signal
import io
from PIL import Image
import json
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
from utilities import *

### 2. Input and Output Paths

In [5]:
# GCS location ("gs://<bucket>/<prefix>") of the parsed Document AI JSON files to read.
# Replace the placeholder with your own path.
input_path = "gs://xxxx/xxxx/xxx"
# GCS location ("gs://<bucket>/<prefix>") under which the updated JSON files are stored.
# Replace the placeholder with your own path.
output_path = "gs://xxx/xxxxx/xxxx/"

* `input_path` : GCS Input Path. It should contain DocAI processed output json files. 
* `output_path` : GCS Output Path. The updated jsons will be saved in output path. 



### 3. Run the Code

In [None]:
# functions
def get_expanded_coordinates(
    binary: np.ndarray,
    bbox_coords: Sequence[Tuple[int, int]],
    open_cv_image: np.ndarray,
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """
    Expand a bounding box outwards until black pixels or the image edge are reached.

    Each side (top, bottom, left, right) is pushed outwards one pixel at a time.
    A side stops moving as soon as the row/column just scanned contains more
    than a small number of zero-valued entries, or when the scan position hits
    the border of the image.

    Parameters:
    - binary (numpy.ndarray): Thresholded image; entries equal to 0 are counted as black.
    - bbox_coords (sequence): Pixel vertices of the box. Only index 0 (used as the
      top-left corner) and index 2 (used as the bottom-right corner) are read.
    - open_cv_image (numpy.ndarray): Image whose shape[0] (height) and shape[1] (width)
      bound the expansion.

    Returns:
    Tuple: Expanded box in the format ((new_x1, new_y1), (new_x2, new_y2)).
    """
    x, y = bbox_coords[0][0], bbox_coords[0][1]
    w, h = bbox_coords[2][0] - x, bbox_coords[2][1] - y
    img_h, img_w = open_cv_image.shape[0], open_cv_image.shape[1]

    # Distance (in pixels) the scan line moves per iteration.
    step_size = 1

    # Initial padding per side, in the order: top, bottom, left, right.
    padding = [3, 3, 3, 3]

    # A scan line with more zero entries than this is treated as a boundary.
    black_pixel_threshold = 5

    # Every iteration moves the scan line by step_size and the loop stops at the
    # image edge, so it is bounded by the image size and needs no timeout.
    for side, direction in enumerate(("top", "bottom", "left", "right")):
        while True:
            if direction == "top":
                pos = max(y - padding[side] - step_size, 0)
                scan_line = binary[pos, x : x + w]
                at_edge = pos <= 0
            elif direction == "bottom":
                pos = min(y + h + padding[side], img_h - 1)
                scan_line = binary[pos, x : x + w]
                at_edge = pos >= img_h - 1
            elif direction == "left":
                pos = max(x - padding[side] - step_size, 0)
                scan_line = binary[y : y + h, pos]
                at_edge = pos <= 0
            else:  # 'right'
                pos = min(x + w + padding[side], img_w - 1)
                scan_line = binary[y : y + h, pos]
                at_edge = pos >= img_w - 1

            # A degenerate box yields an empty scan line that can never reach
            # the threshold; keep the current padding for this side.
            if scan_line.size == 0:
                break

            padding[side] += step_size

            # Stop on a boundary, or at the image edge where further steps
            # would only re-scan the same clamped row/column.
            if at_edge or np.sum(scan_line == 0) > black_pixel_threshold:
                break

    # Build the new box from the accumulated padding, clamped to the image.
    x_new = max(x - padding[2], 0)
    y_new = max(y - padding[0], 0)
    w_new = min(w + padding[2] + padding[3], img_w - x_new)
    h_new = min(h + padding[0] + padding[1], img_h - y_new)

    return (x_new, y_new), (x_new + w_new, y_new + h_new)


def create_image_bbox(
    updated_coordinates: Dict[str, List[Tuple[Tuple[int, int], Tuple[int, int]]]],
    p1: str,
    open_cv_image_1: np.ndarray,
) -> None:
    """
    Overlay the extended and the actual entity boxes on an image and save it to disk.

    Parameters:
    - updated_coordinates (dict): Holds two lists under the keys 'extended' and 'actual'.
        - 'extended': Boxes whose elements 0 and 1 are the two corner points to draw.
        - 'actual': Boxes given as sequences of (x, y) points; the rectangle
          enclosing those points is drawn.
    - p1 (str): Value inserted into the name of the written image file.
    - open_cv_image_1 (numpy.ndarray): Image that is drawn on in place.

    Returns:
    None
    """
    extended_color = (0, 0, 255)
    actual_color = (255, 0, 0)
    line_width = 2

    # Extended boxes are drawn first, exactly as given.
    for box in updated_coordinates["extended"]:
        cv2.rectangle(open_cv_image_1, box[0], box[1], extended_color, line_width)

    # Actual boxes are reduced to the rectangle enclosing all their points.
    for polygon in updated_coordinates["actual"]:
        xs, ys = zip(*polygon)
        upper_left = (min(xs), min(ys))
        lower_right = (max(xs), max(ys))
        cv2.rectangle(
            open_cv_image_1, upper_left, lower_right, actual_color, line_width
        )

    cv2.imwrite(f"paystub_{p1}.jpg", open_cv_image_1)


def _binarize_page(page_image: np.ndarray) -> np.ndarray:
    """
    Return a thresholded copy of a page after painting detected contours black.

    Parameters:
    - page_image (numpy.ndarray): RGB page image; contours are drawn onto it in place.

    Returns:
    numpy.ndarray: Result of thresholding the modified page at 128.
    """
    # The array comes from a PIL "RGB" conversion, so the channel order is RGB.
    gray = cv2.cvtColor(page_image, cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray, 50, 150)

    # Dilate the edges to thicken them.
    # NOTE(review): the kernel is passed as the tuple (3, 3) rather than a 3x3
    # array such as np.ones((3, 3), np.uint8) - confirm the intended kernel.
    dilated_edges = cv2.dilate(edges, (3, 3), iterations=1)

    contours, _ = cv2.findContours(
        dilated_edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Paint the contours in black so they act as boundaries when scanning.
    for contour in contours:
        cv2.drawContours(page_image, [contour], -1, (0, 0, 0), 2)

    _, binary = cv2.threshold(page_image, 128, 255, cv2.THRESH_BINARY)
    return binary


def _extend_entity_bbox(
    entity: documentai.Document.Entity,
    page_index: int,
    binary: np.ndarray,
    page_image: np.ndarray,
) -> None:
    """
    Replace the first page reference's polygon of an entity with its expanded box.

    Entities without a page reference, without vertices, with a degenerate box,
    or whose first page reference points at another page are left untouched.

    Parameters:
    - entity (documentai.Document.Entity): Entity (or sub-entity) to update in place.
    - page_index (int): Index of the page that `binary` and `page_image` belong to.
    - binary (numpy.ndarray): Thresholded page image used to find boundaries.
    - page_image (numpy.ndarray): Page image used for its height and width.

    Returns:
    None
    """
    page_refs = entity.page_anchor.page_refs
    if not page_refs:
        return

    # The polygon and the page number are both taken from the first page
    # reference so that they always describe the same page.
    if int(page_refs[0].page) != page_index:
        return

    img_h, img_w = page_image.shape[0], page_image.shape[1]
    pixel_vertices = [
        (int(vertex.x * img_w), int(vertex.y * img_h))
        for vertex in page_refs[0].bounding_poly.normalized_vertices
    ]
    if not pixel_vertices:
        return

    # Use the enclosing axis-aligned rectangle so that the corner order does
    # not depend on how the vertices are ordered or skewed.
    xs, ys = zip(*pixel_vertices)
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)
    if max_x <= min_x or max_y <= min_y:
        return

    corners = [(min_x, min_y), (max_x, min_y), (max_x, max_y), (min_x, max_y)]
    extended_bbox = get_expanded_coordinates(binary, corners, page_image)

    xmin = extended_bbox[0][0] / img_w
    ymin = extended_bbox[0][1] / img_h
    xmax = extended_bbox[1][0] / img_w
    ymax = extended_bbox[1][1] / img_h
    entity.page_anchor.page_refs[0].bounding_poly.normalized_vertices = [
        {"x": xmin, "y": ymin},
        {"x": xmax, "y": ymin},
        {"x": xmax, "y": ymax},
        {"x": xmin, "y": ymax},
    ]


def update_extended_bbox(json_data: documentai.Document) -> documentai.Document:
    """
    Update the extended bounding box coordinates in the input DocumentAI JSON data.

    For every page image, each entity and each of its direct properties located
    on that page gets its bounding polygon replaced by the expanded box.

    Parameters:
    - json_data (documentai.Document): DocumentAI JSON data; modified in place.

    Returns:
    documentai.Document: The same document with extended bounding box coordinates.
    """
    # Only the second returned value is used; it is indexed per page and each
    # item is converted with .convert("RGB") (presumably PIL images - confirm
    # against the utilities module).
    _, page_images = create_pdf_bytes_from_json(
        documentai.Document.to_dict(json_data)
    )

    for page_index, page_image in enumerate(page_images):
        open_cv_image = np.array(page_image.convert("RGB"))
        binary = _binarize_page(open_cv_image)

        for entity in json_data.entities:
            # Sub-entities are extended before their parent entity.
            for subentity in entity.properties:
                _extend_entity_bbox(subentity, page_index, binary, open_cv_image)
            _extend_entity_bbox(entity, page_index, binary, open_cv_image)

    return json_data


# calling functions
file_name_list, file_path_dict = file_names(input_path)

# "gs://<bucket>/<prefix>" -> element 2 of the "/"-split is the bucket name.
input_bucket = input_path.split("/")[2]
output_bucket = output_path.split("/")[2]
# Strip surrounding slashes so a trailing "/" in output_path does not produce
# "prefix//file" object names.
output_prefix = "/".join(output_path.split("/")[3:]).strip("/")

for file_name in file_name_list:
    print(file_name)
    json_data = documentai_json_proto_downloader(
        input_bucket, file_path_dict[file_name]
    )
    json_updated = update_extended_bbox(json_data)
    output_blob = f"{output_prefix}/{file_name}" if output_prefix else file_name
    store_document_as_json(
        documentai.Document.to_json(json_updated),
        output_bucket,
        output_blob,
    )

### 4.Output

The bounding boxes of the entities are enlarged up to the nearest text or line.

Visualizing both the original and the extended bounding boxes gives a result like the image below.



<img src="./Images/output.png" width=800 height=400></img>



### 5.Limitations
* This script works best when the form has clearly defined boxes or lines, as shown in the image above.
* If multiple entities are very close to each other, the script might not work as expected, as shown below.
