# DocAI Splitting Overlapping Entities


* Author: docai-incubator@google.com


## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.

## Purpose and Description
This tool takes exported, labeled Document AI JSON files and splits each pair of entities whose labels overlap into two separate, individual entities.

## Prerequisites

1. Vertex AI Notebook
2. Parsed json files in GCS Folder.
3. Output folder to upload the updated json files.

## Step by Step procedure 

### 1. Input details


In [3]:
# input details
# INPUT : GCS path (gs://<bucket>/<prefix>) of the folder holding the input json files
input_path = "gs://xxxxx/xxxxxxxx"
# OUTPUT : GCS path (gs://<bucket>/<prefix>) where the updated json files are written
output_path = "gs://xxxxxx/xxxxxxxx"

list_of_pair_of_entities = [
    ("currency", "invoice_id"),
    ("purchase_order", "delivery_date"),
]  # List of pairs of entity types that need to be split.
# Each pair must be written in the order (small_entity, large_entity).

<ul>
    <li><b>input_path :</b> GCS Path for input json files</li>
    <li><b>output_path:</b> GCS Path for output json files</li>
    <li><b>list_of_pair_of_entities:</b> [('customer_account_name','ship_to_address')]</li>
</ul>
<div style="background-color:#f5f569" ><i><b>Note:</b> List of pairs of entities that need to be split. Each pair must be written in the order (small_entity, large_entity).</i></div>

### 2. Output

After the code has run, the output JSON files contain the previously overlapping entities as separate, individual entities.
<img src="./Images/overlapping_split_output_1.png" width=800 height=400 alt="Overlapping entity split output">

### 3. Run the code

In [None]:
%pip install tqdm
%pip install google.cloud

In [None]:
# Run this cell to download utilities module
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
from io import BytesIO
import json, copy
from google.cloud import storage
from tqdm.notebook import tqdm
from utilities import (
    file_names,
    documentai_json_proto_downloader,
    bb_intersection_over_union,
    store_document_as_json,
    bbox_maker,
)
from google.cloud import documentai_v1beta3 as documentai
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union


# A "gs://bucket/prefix/..." path split on "/" gives ["gs:", "", bucket, *prefix_parts].
input_path_parts = input_path.split("/")
output_path_parts = output_path.split("/")

input_storage_bucket_name = input_path_parts[2]
input_bucket_path_prefix = "/".join(input_path_parts[3:])
output_storage_bucket_name = output_path_parts[2]
output_bucket_path_prefix = "/".join(output_path_parts[3:])

# Second element returned by file_names() is a mapping whose values are the file paths.
json_files = file_names(input_path)[1].values()
# Only ".json" files are processed.
list_of_files = [path for path in json_files if path.endswith(".json")]


def get_entity_coordinates(
    entity1: documentai.Document.Entity, entity2: documentai.Document.Entity
) -> Tuple[float, float]:
    """
    Build the bounding box of each of two entities.

    Only the first page reference of every entity is read; its normalized
    vertices are converted to ``{"x": ..., "y": ...}`` dicts and handed to
    ``bbox_maker``.

    Parameters
    ----------
    entity1 : documentai.Document.Entity
            The first entity of the pair.
    entity2 : documentai.Document.Entity
            The second entity of the pair.

    Returns
    -------
    Tuple
        (entity1_box, entity2_box) : the ``bbox_maker`` result for each entity,
        in the same order as the arguments.
    """
    # Gather the vertex dicts of both entities first, then convert each to a box.
    vertex_dicts = [
        [
            {"x": vertex.x, "y": vertex.y}
            for vertex in ent.page_anchor.page_refs[0].bounding_poly.normalized_vertices
        ]
        for ent in (entity1, entity2)
    ]
    first_box, second_box = [bbox_maker(points) for points in vertex_dicts]

    return first_box, second_box


def find_textSegment_list(
    x_min: int, y_min: int, x_max: int, y_max: int, js: documentai.Document, page: int
) -> List[documentai.Document.TextAnchor.TextSegment]:
    """
    Collect the text segments of every token lying fully inside a rectangle.

    A token is selected when its whole bounding box is within the given
    bounds (bounds inclusive).

    Parameters
    ----------
    x_min : Lower bound for the token's smallest x coordinate.
    y_min : Lower bound for the token's smallest y coordinate.
    x_max : Upper bound for the token's largest x coordinate.
    y_max : Upper bound for the token's largest y coordinate.
            The bounds are compared against normalized vertex coordinates,
            so float values are expected despite the ``int`` annotations.
    js    : documentai.Document
            The Document proto object whose tokens are scanned.
    page : int
            Index into ``js.pages`` of the page to scan.

    Returns
    -------
    List[documentai.Document.TextAnchor.TextSegment] :
        Text segments of all selected tokens, in token order.
    """
    collected_segments = []
    for tok in js.pages[page].tokens:
        corner_points = [
            {"x": vertex.x, "y": vertex.y}
            for vertex in tok.layout.bounding_poly.normalized_vertices
        ]
        box = bbox_maker(corner_points)
        # The box is read as [x_min, y_min, x_max, y_max].
        left, right = box[0], box[2]
        top, bottom = box[1], box[3]

        fits_horizontally = x_min <= left and right <= x_max
        fits_vertically = y_min <= top and bottom <= y_max
        if fits_horizontally and fits_vertically:
            collected_segments.extend(tok.layout.text_anchor.text_segments)

    return collected_segments


def split_overlapping_entities(
    large_entity: documentai.Document.Entity,
    small_entity: documentai.Document.Entity,
    js: documentai.Document,
    page: str,
) -> documentai.Document.Entity:
    """Build a replacement for ``large_entity`` with ``small_entity`` cut out of it.

    The new entity keeps the type of ``large_entity``. Its text is the large
    entity's mention text with every occurrence of the small entity's mention
    text removed. Its bounding polygon runs from the small entity's max-y edge
    to the large entity's max-y edge, and its text segments are those of the
    page tokens found inside that area (with a 0.005 margin on each side).

    Args:
        large_entity (documentai.Document.Entity): Entity that contains the overlap.
        small_entity (documentai.Document.Entity): Entity to cut out of ``large_entity``.
        js (documentai.Document): Document proto object whose tokens are searched.
        page (str): Page of the entities. It is used as an index into ``js.pages``
            and stored, converted with ``str()``, on the new page reference.

    Returns:
        documentai.Document.Entity: The newly built entity. Neither input entity
        nor ``js`` is modified by this function.
    """
    remaining_text = large_entity.mention_text.replace(small_entity.mention_text, "")

    result = documentai.Document.Entity()
    result.type = large_entity.type
    result.mention_text = remaining_text
    anchor = documentai.Document.TextAnchor()
    anchor.content = remaining_text

    # Read the vertices of both entities before turning them into boxes.
    small_points = [
        {"x": vertex.x, "y": vertex.y}
        for vertex in small_entity.page_anchor.page_refs[0].bounding_poly.normalized_vertices
    ]
    large_points = [
        {"x": vertex.x, "y": vertex.y}
        for vertex in large_entity.page_anchor.page_refs[0].bounding_poly.normalized_vertices
    ]
    # Boxes are read as [x_min, y_min, x_max, y_max].
    small_box = bbox_maker(small_points)
    large_box = bbox_maker(large_points)

    # First two corners sit on the small entity's max-y edge,
    # last two on the large entity's max-y edge.
    near_left = {"x": small_box[0], "y": small_box[3]}
    near_right = {"x": small_box[2], "y": small_box[3]}
    far_right = {"x": large_box[2], "y": large_box[3]}
    far_left = {"x": large_box[0], "y": large_box[3]}

    # Start from the large entity's page anchor, then overwrite polygon and page.
    result.page_anchor = large_entity.page_anchor
    page_ref = result.page_anchor.page_refs[0]
    page_ref.bounding_poly.normalized_vertices = [
        near_left,
        near_right,
        far_right,
        far_left,
    ]
    page_ref.page = str(page)

    # Widen the search area slightly so tokens touching the border are kept.
    margin = 0.005
    anchor.text_segments = find_textSegment_list(
        near_left["x"] - margin,
        near_left["y"] - margin,
        far_right["x"] + margin,
        far_right["y"] + margin,
        js,
        page,
    )
    result.text_anchor = anchor
    return result


def _entity_page_number(entity: documentai.Document.Entity) -> int:
    """Return the page of the entity's first page reference, 0 when it is unset."""
    return int(entity.page_anchor.page_refs[0].page or 0)


def update_document(
    document: documentai.Document,
    list_of_small_entity: List[documentai.Document.Entity],
    large_entity: str,
) -> documentai.Document:
    """Split every large entity that overlaps one of the given small entities.

    For each small entity, all entities of type ``large_entity`` on the same page
    are checked. When the IoU of the two bounding boxes is greater than 0.0, the
    large entity is removed from the document and the entity returned by
    split_overlapping_entities() is appended in its place.

    Args:
        document (documentai.Document): Document proto object holding all entities,
            including the overlapping ones. It is modified in place.
        list_of_small_entity (List[documentai.Document.Entity]): Small entities that
            may overlap a large entity.
        large_entity (str): Entity type of the large entities to be split.

    Returns:
        documentai.Document: The same Document proto object, with overlapping large
        entities replaced by their split versions.
    """
    for small in list_of_small_entity:
        small_page = _entity_page_number(small)
        # Iterate over a snapshot: document.entities is modified inside the loop,
        # and mutating it while iterating would skip the entity that follows a
        # removed one.
        for candidate in list(document.entities):
            if candidate.type != large_entity:
                continue
            candidate_page = _entity_page_number(candidate)
            if candidate_page != small_page:
                continue
            large_box, small_box = get_entity_coordinates(candidate, small)
            iou = bb_intersection_over_union(large_box, small_box)
            if iou > 0.0:
                replacement = split_overlapping_entities(
                    candidate, small, document, candidate_page
                )
                document.entities.remove(candidate)
                document.entities.append(replacement)
    return document


# Process every input file: split the configured entity pairs and upload the result.
for file_path in tqdm(list_of_files):
    # Reset per file; kept at module level because code outside this loop may
    # reference the name.
    new_entities = []
    print("\nProcessing >>> ", file_path)
    document = documentai_json_proto_downloader(input_storage_bucket_name, file_path)
    try:
        for small_entity, large_entity in list_of_pair_of_entities:
            list_of_small_entity = [
                entity for entity in document.entities if entity.type == small_entity
            ]
            document = update_document(document, list_of_small_entity, large_entity)

    except Exception as e:
        # Best effort: report the failure and move on without uploading this file.
        print(
            ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            + file_path
            + " was not processed successfully!!!"
        )
        print(e)
        continue

    # Build the destination object name without a leading or doubled "/",
    # whether the output prefix is empty or ends with a slash.
    file_name = file_path.split("/")[-1]
    output_prefix = output_bucket_path_prefix.strip("/")
    output_blob_name = f"{output_prefix}/{file_name}" if output_prefix else file_name
    store_document_as_json(
        documentai.Document.to_json(document),
        output_storage_bucket_name,
        output_blob_name,
    )