# Line Item Improver Using Column Data

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

## Objective

This tool is intended as a guide for tagging column data inside line items in order to improve the model's F1 score.

## Prerequisites
* Python : Jupyter Notebook (Vertex).
* Storage Bucket for storing exported json files and output JSON files.
* Permissions for Google DocAI Processors, Cloud Storage and Vertex AI Notebook.


## Step by Step Procedure

### 1. Import Modules/Packages

In [None]:
# Run this cell to download utilities module
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-documentai google-cloud-storage

In [None]:
import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from tqdm import tqdm

from utilities import file_names, blob_downloader, bbox_maker, store_document_as_json

### 2. Input Details

* **gcs_input_path**: Provide the gcs path of the parent folder where the sub-folders contain input files. Please follow the folder structure described earlier.
* **output_jsons**: Provide the GCS path where the output JSON files will be saved.

In [None]:
# GCS folder holding the input JSON files. The driver cell takes the bucket
# name from the third "/"-separated element of this path.
gcs_input_path = "gs://<<bucket_name>>/<<sub_folder>>/"
# GCS folder for the results. Keep the trailing "/": the driver cell appends
# "new_<file name>" directly to the prefix without adding a separator.
output_jsons = "gs://<<bucket_name>>/<<output_sub_folder>>/"

### 3. Run the required functions

In [None]:
def column_item_child_entities_coordinates(entity: Dict[str, Any]) -> List[List[float]]:
    """
    Extracts bounding box coordinates for all child entities within a parent entity.

    An entity without children (no "properties" key, or a null/empty value)
    yields an empty list instead of raising, so the function can be called on
    every entity of a document.

    Args:
    - entity (Dict[str, Any]): A dictionary representing the parent entity. Each item of
      its optional "properties" list must carry
      pageAnchor.pageRefs[0].boundingPoly.normalizedVertices.

    Returns:
    - List[List[float]]: One bounding box per child entity, in the order of "properties",
      as returned by bbox_maker (expected to be [min_x, min_y, max_x, max_y]).

    Raises:
    - KeyError / IndexError: If a child entity lacks the bounding-poly path described above.
    """
    # `or []` also covers an explicit `"properties": None`.
    children = entity.get("properties") or []
    return [
        bbox_maker(
            child["pageAnchor"]["pageRefs"][0]["boundingPoly"]["normalizedVertices"]
        )
        for child in children
    ]


def get_token_xy(token: Any) -> Tuple[float, float, float, float]:
    """
    Computes the axis-aligned extent of a token from its normalized vertices.

    Args:
    - token (Any): A token object exposing layout.bounding_poly.normalized_vertices,
      where every vertex has `x` and `y` attributes.

    Returns:
    - Tuple[float, float, float, float]: (min_x, min_y, max_x, max_y) over all vertices.
    """
    corners = token.layout.bounding_poly.normalized_vertices
    # Split the vertices into their two coordinate series once, then reduce each.
    x_values = [corner.x for corner in corners]
    y_values = [corner.y for corner in corners]
    return min(x_values), min(y_values), max(x_values), max(y_values)


def get_token_data(
    json_dict: Any,
    min_x: float,
    max_x: float,
    min_y: float,
    max_y: float,
    page_num: int,
    *,
    x_allowance: float = 0.01,
    y_allowance: float = 0.001,
) -> Optional[Tuple[str, List[Dict[str, int]], List[Dict[str, float]]]]:
    """
    Collects the tokens of one page that lie inside a bounding box and merges them.

    A token is selected when its own extent, relaxed by the allowances, fits inside
    [min_x, max_x] x [min_y, max_y]. Tokens without any text segment are ignored.

    Args:
    - json_dict (Any): Document object exposing `pages` (each with `page_number` and
      `tokens`) and the full document `text`.
    - min_x (float): Minimum x-coordinate of the bounding box.
    - max_x (float): Maximum x-coordinate of the bounding box.
    - min_y (float): Minimum y-coordinate of the bounding box.
    - max_y (float): Maximum y-coordinate of the bounding box.
    - page_num (int): Zero-based page index; compared against `page.page_number - 1`.
    - x_allowance (float): Horizontal tolerance applied to each token edge.
    - y_allowance (float): Vertical tolerance applied to each token edge.

    Returns:
    - Optional[Tuple[str, List[Dict[str, int]], List[Dict[str, float]]]]: None when no
      token matches, otherwise a tuple containing:
        1. The text of the matched segments, concatenated in order of start index.
        2. One {"endIndex", "startIndex"} dictionary per matched text segment, in
           token order.
        3. The four corners of the box enclosing all matched tokens, ordered
           top-left, top-right, bottom-right, bottom-left.
    """
    segment_spans: List[Tuple[int, int]] = []
    text_anchor: List[Dict[str, int]] = []
    matched_x: List[float] = []
    matched_y: List[float] = []

    for page in json_dict.pages:
        if page_num != page.page_number - 1:
            continue
        for token in page.tokens:
            minx_token, miny_token, maxx_token, maxy_token = get_token_xy(token)
            fits_in_box = (
                min_y <= miny_token + y_allowance
                and max_y >= maxy_token - y_allowance
                and min_x <= minx_token + x_allowance
                and max_x >= maxx_token - x_allowance
            )
            if not fits_in_box:
                continue
            segments = token.layout.text_anchor.text_segments
            if not segments:
                # Nothing to add to the text, so the token must not grow the box either.
                continue
            matched_x.extend([minx_token, maxx_token])
            matched_y.extend([miny_token, maxy_token])
            for seg in segments:
                segment_spans.append((seg.start_index, seg.end_index))
                text_anchor.append(
                    {"endIndex": seg.end_index, "startIndex": seg.start_index}
                )

    if not segment_spans:
        return None

    left, right = min(matched_x), max(matched_x)
    top, bottom = min(matched_y), max(matched_y)
    # Walk the rectangle's perimeter so the vertices form a simple polygon;
    # a TL, TR, BL, BR sequence would describe a self-intersecting shape.
    page_anc = [
        {"x": left, "y": top},
        {"x": right, "y": top},
        {"x": right, "y": bottom},
        {"x": left, "y": bottom},
    ]
    # Sort on the start index only (stable), so reading order follows the document text.
    ordered_spans = sorted(segment_spans, key=lambda span: span[0])
    mention_text = "".join(json_dict.text[start:end] for start, end in ordered_spans)
    return mention_text, text_anchor, page_anc


def update_parent_entity(
    entity: Dict[str, Any],
    token: Tuple[str, List[Dict[str, int]], List[Dict[str, float]]],
) -> Dict[str, Any]:
    """
    Adds the given token data to a parent entity as a new child and widens the parent.

    The parent is modified in place: a new child is appended to "properties", the
    parent's mention text, text anchor and bounding polygon are extended to cover it.

    Args:
    - entity (Dict[str, Any]): The parent entity to be updated. It must already have at
      least one child in "properties"; the new child copies the type of the last one.
    - token (Tuple[str, List[Dict[str, int]], List[Dict[str, float]]]): Token data to be
      added, as (mention text, text segments, bounding-poly vertices).

    Returns:
    - Dict[str, Any]: The same `entity` object, after the update.
    """
    mention_text, text_anc, page_anc = token
    last_child = entity["properties"][-1]
    # The page index is stored on the page ref itself (pageAnchor.pageRefs[0].page),
    # not directly on pageAnchor; fall back to "0" when the ref omits it.
    page_num = last_child["pageAnchor"]["pageRefs"][0].get("page", "0")
    new_entity = {
        "confidence": 1,
        "mentionText": mention_text,
        "pageAnchor": {
            "pageRefs": [
                {"boundingPoly": {"normalizedVertices": page_anc}, "page": page_num}
            ]
        },
        "textAnchor": {"content": mention_text, "textSegments": text_anc},
        "type": last_child["type"],
    }
    entity["properties"].append(new_entity)
    entity["mentionText"] += " " + mention_text
    # NOTE(review): unlike mentionText, content is extended without a separating
    # space; kept as in the original, confirm whether that is intended.
    entity["textAnchor"]["content"] += mention_text
    entity["textAnchor"]["textSegments"].extend(text_anc)

    # Replace the parent's polygon with the union of its old box and the new child's box.
    parent_poly = entity["pageAnchor"]["pageRefs"][0]["boundingPoly"]
    parent_bbox = bbox_maker(parent_poly["normalizedVertices"])
    child_bbox = bbox_maker(page_anc)
    max_x = max(parent_bbox[2], child_bbox[2])
    max_y = max(parent_bbox[3], child_bbox[3])
    min_x = min(parent_bbox[0], child_bbox[0])
    min_y = min(parent_bbox[1], child_bbox[1])
    parent_poly["normalizedVertices"] = [
        {"x": max_x, "y": max_y},
        {"x": max_x, "y": min_y},
        {"x": min_x, "y": min_y},
        {"x": min_x, "y": max_y},
    ]
    return entity

### 4. Run the code

In [None]:
if __name__ == "__main__":
    # Both paths look like "gs://<bucket>/<prefix>/": split on "/", element 2 is
    # the bucket name and everything from element 3 onwards is the object prefix.
    input_bucket = gcs_input_path.split("/")[2]
    output_bucket = output_jsons.split("/")[2]
    output_prefix = "/".join(output_jsons.split("/")[3:])

    gs_file_name = list(file_names(gcs_input_path)[1].values())
    for blob_path in gs_file_name:
        file_name = blob_path.split("/")[-1]
        json_data = blob_downloader(input_bucket, blob_path)
        # The document is held twice: the plain dict is edited and saved, while the
        # Document object gives attribute access to page tokens and the full text.
        json_dict = documentai.Document.from_json(json.dumps(json_data))

        entities = json_data.get("entities", [])
        for idx, entity in enumerate(entities):
            # A gap can only exist between two children, so entities with fewer
            # than two properties (or none at all) are left untouched.
            if len(entity.get("properties") or []) < 2:
                continue

            all_entity_coordinates = column_item_child_entities_coordinates(
                entity
            )  # [min(x_list), min(y_list), max(x_list), max(y_list)]
            # Order the children top to bottom by their lower edge.
            sorted_entity_coordinates = sorted(
                all_entity_coordinates, key=lambda box: box[3]
            )
            page_number = int(entity["pageAnchor"]["pageRefs"][0].get("page", 0))

            for upper_box, lower_box in zip(
                sorted_entity_coordinates, sorted_entity_coordinates[1:]
            ):
                # Re-read the parent box on every pass: update_parent_entity
                # rewrites the parent's polygon whenever a child is added.
                entity_bbox = bbox_maker(
                    entity["pageAnchor"]["pageRefs"][0]["boundingPoly"][
                        "normalizedVertices"
                    ]
                )
                min_x = min(entity_bbox[0], entity_bbox[2])
                max_x = max(entity_bbox[0], entity_bbox[2])
                # Search the vertical gap between the bottom of one child and
                # the top of the next, within the parent's horizontal extent.
                token_data = get_token_data(
                    json_dict,
                    min_x,
                    max_x,
                    upper_box[3],
                    lower_box[1],
                    page_number,
                )
                if token_data is not None:
                    entity = update_parent_entity(entity, token_data)
            entities[idx] = entity

        store_document_as_json(
            json.dumps(json_data),
            output_bucket,
            output_prefix + "new_" + file_name,
        )
        print("updated Json")
    print("Done")

### Output Details

### Before Tagging
<img src='./images/before.png' width=600 height=600 alt="Sample Output"></img>
### After Tagging
<img src='./images/after.png' width=600 height=600 alt="Sample Output"></img>