# Doc AI Synonyms Entity Tag

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

## Objective

This tool takes parsed Document AI JSON files and a dictionary whose keys are entity names and whose values are lists of synonyms that should be tagged with that entity name. The newly tagged entities are added to the JSON.

Approach: Each synonym (dictionary value) is searched for in the OCR text, and every match is tagged as an entity whose type is the corresponding dictionary key.


## Prerequisite

* Vertex AI Notebook
* Parsed json files in GCS Folder
* Output folder to upload the updated json files


## Step by Step procedure

### 1. Importing Required Modules

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [41]:
from google.cloud import storage
from tqdm import tqdm
from google.cloud import documentai_v1beta3 as documentai
import json
import utilities

### 2. Set Up the Inputs

* `project_id`: The Google Cloud project ID.
* `gcs_input_path`: GCS URI of the input folder. It should contain the DocAI processed output JSON files that this tool reads.
* `gcs_output_path`: GCS URI of the folder where the updated JSON files are stored.
* `synonyms_entities`: A dictionary whose keys are entity names and whose values are lists of synonyms to be tagged with that entity name.

In [None]:
# input details
# Google Cloud project id.
project_id = "xxxx-xxxx-xxxx"
# GCS folder that holds the parsed Document AI JSON files to read.
gcs_input_path = "gs://xxxx/xxxx/xxx/"
# GCS folder that receives the updated JSON files.
gcs_output_path = "gs://xxxx/xxxx/xxx/"
# Entity type -> synonyms to look for in the OCR text.
synonyms_entities = {
    "cust_name": [
        "ROsweLL PARK MEMORIAL",
        "inst",
    ],
    "Name": [
        "name",
        "firstname",
        "lastname",
        "middlename",
    ],
}

### 3. Run the Code

In [44]:
def get_normalizedvertices(normalized_vertices: object) -> tuple:
    """
    Compute the axis-aligned extent of a bounding polygon.

    Args:
        normalized_vertices (object) : Object exposing a ``normalized_vertices``
            sequence whose items carry ``x`` and ``y`` attributes.
    Returns:
        tuple: (min_x, min_y, max_x, max_y) taken over all vertices.
    """

    # Gather each axis once, then take the extremes.
    xs = [point.x for point in normalized_vertices.normalized_vertices]
    ys = [point.y for point in normalized_vertices.normalized_vertices]

    return min(xs), min(ys), max(xs), max(ys)


def get_token(json_dict: object, page: int, text_anchors_check: list) -> tuple:
    """
    Find the token(s) on a page that cover a text span and return their geometry.

    The span is matched against the first text segment of every token on the
    page, in this order of preference for the returned value:

    1. Tokens lying inside the span (with a 2-character tolerance on both ends)
       and longer than 2 characters are accumulated; if any were found, their
       combined bounding box, their text segments sorted by ``end_index`` and
       the lowest confidence among them are returned.
    2. Otherwise the values of the last token whose ``text_segments`` compared
       equal to ``text_anchors_check`` are returned.
    3. If neither of the above matched any token, a second pass picks the last
       token whose start and end indices are both within 2 characters of the
       span.

    Side effect: a falsy ``start_index`` on a token's first text segment is
    overwritten with ``0``.

    Args:
        json_dict (object) : The document object; ``pages[page].tokens`` is read.
        page (int) : Index into ``json_dict.pages``.
        text_anchors_check (list) : Span to look for. Element ``[0]`` must
            support ``["start_index"]`` and ``["end_index"]`` lookups.
    Returns:
        tuple: (dict with keys 'min_x', 'min_y', 'max_x', 'max_y';
        text segments of the matched token(s); confidence).
    Raises:
        UnboundLocalError: When no token matches in any pass, because
            ``min_y``, ``text_anc_token`` and ``confidence`` are never assigned
            before the final return.
    """
    # "" is a sentinel for "nothing matched yet"; any match replaces it with a coordinate.
    min_x = ""
    temp_text_anc = []
    temp_confidence = []
    temp_ver = {"x": [], "y": []}
    for token in json_dict.pages[page].tokens:
        # Normalise a missing/zero start index so the int() conversions below are safe.
        if not token.layout.text_anchor.text_segments[0].start_index:
            token.layout.text_anchor.text_segments[0].start_index = 0
        token_anc = token.layout.text_anchor.text_segments[0]
        # NOTE(review): this compares the token's segment container with the
        # caller's list; whether it can ever be equal depends on that
        # container's == semantics - confirm.
        if token.layout.text_anchor.text_segments == text_anchors_check:
            normalized_vertices = token.layout.bounding_poly
            min_x, min_y, max_x, max_y = get_normalizedvertices(normalized_vertices)
            text_anc_token = token.layout.text_anchor.text_segments
            confidence = token.layout.confidence
        # Token sits inside the span (+/- 2 characters) and is longer than 2 characters.
        elif (
            int(token_anc.start_index) >= int(text_anchors_check[0]["start_index"]) - 2
            and int(token_anc.end_index) <= int(text_anchors_check[0]["end_index"]) + 2
            and abs(int(token_anc.start_index) - int(token_anc.end_index)) > 2
        ):
            normalized_vertices = token.layout.bounding_poly
            # Assigning min_x here also clears the sentinel, so the fallback pass below is skipped.
            min_x, min_y, max_x, max_y = get_normalizedvertices(normalized_vertices)
            temp_ver["x"].extend([min_x, max_x])
            temp_ver["y"].extend([min_y, max_y])
            text_anc_token = token.layout.text_anchor.text_segments
            for an1 in text_anc_token:
                temp_text_anc.append(an1)
            confidence = token.layout.confidence
            temp_confidence.append(confidence)

    # Fallback pass: runs only when neither branch above matched any token.
    if min_x == "":
        for token in json_dict.pages[page].tokens:
            if not token.layout.text_anchor.text_segments[0].start_index:
                token.layout.text_anchor.text_segments[0].start_index = 0

            # Accept a token whose start and end are each within 2 characters of the span.
            if (
                abs(
                    int(token.layout.text_anchor.text_segments[0].start_index)
                    - int(text_anchors_check[0]["start_index"])
                )
                <= 2
                and abs(
                    int(token.layout.text_anchor.text_segments[0].end_index)
                    - int(text_anchors_check[0]["end_index"])
                )
                <= 2
            ):
                normalized_vertices = token.layout.bounding_poly
                min_x, min_y, max_x, max_y = get_normalizedvertices(normalized_vertices)
                text_anc_token = token.layout.text_anchor.text_segments
                confidence = token.layout.confidence
    # Accumulated tokens take precedence over any single-token match.
    if len(temp_text_anc) != 0:
        final_ver = {
            "min_x": min(temp_ver["x"]),
            "min_y": min(temp_ver["y"]),
            "max_x": max(temp_ver["x"]),
            "max_y": max(temp_ver["y"]),
        }
        # The least confident token bounds the confidence of the merged span.
        final_confidence = min(temp_confidence)
        final_text_anc = sorted(temp_text_anc, key=lambda x: x.end_index)
        return final_ver, final_text_anc, final_confidence
    else:
        # If nothing matched at all, the names below are unbound and this raises UnboundLocalError.
        return (
            {"min_x": min_x, "min_y": min_y, "max_x": max_x, "max_y": max_y},
            text_anc_token,
            confidence,
        )


def synonym_entities(json_dict: object, Synonyms_entities: dict) -> object:
    """
    Tag every occurrence of the configured synonyms in the document text as a new entity.

    For each entity type in ``Synonyms_entities`` every synonym is searched for
    (case-insensitively) in ``json_dict.text``. Each hit is attributed to the
    page whose first text segment contains it, resolved to page tokens with
    ``get_token`` and appended to ``json_dict.entities``. Hits for which
    ``get_token`` fails are reported with ``print`` and skipped.

    Args:
        json_dict (object): Document object; ``text``, ``pages`` and ``entities``
            are used, and ``entities`` is modified in place.
        Synonyms_entities (dict): Maps an entity type to an iterable of synonym
            strings to search for.
    Returns:
        object: The same ``json_dict`` with the new entities appended.
    """
    import re

    def find_substring_indexes(text: str, substring: str) -> list:
        """
        Find the start and end indices of all occurrences of a substring in the given text.

        Matching ignores case, and every whitespace run in ``substring`` matches
        any whitespace run in ``text``, so a multi-word synonym is still found
        when the OCR text breaks it across lines.

        Args:
            text (str): The text to search in.
            substring (str): The substring to find.

        Returns:
            List: A list of (start, end) tuples, ``end`` exclusive. Empty when
            ``substring`` is empty or whitespace only.
        """
        # An empty pattern would match at every position of the text.
        if not substring or not substring.strip():
            return []
        pieces = re.split(r"\s+", substring)
        pattern = re.compile(
            r"\s+".join(re.escape(piece) for piece in pieces), re.IGNORECASE
        )
        return [(found.start(), found.end()) for found in pattern.finditer(text)]

    def create_ent(
        ent_type: str, min_xy: dict, text_anc: list, page: int, confidence: float
    ) -> dict:
        """
        Create an entity dictionary.

        Args:
            ent_type (str): The type of the entity.
            min_xy (dict): Bounding box with keys 'min_x', 'min_y', 'max_x', 'max_y'.
            text_anc (list): Text segments (objects with ``start_index`` / ``end_index``).
            page (int): Page index the entity belongs to.
            confidence (float): Confidence score.

        Returns:
            dict: The created entity dictionary.
        """
        # The mention text is the concatenation of the text behind every segment.
        final_mention_text = "".join(
            json_dict.text[int(segment.start_index) : int(segment.end_index)]
            for segment in text_anc
        )
        min_x = min_xy["min_x"]
        min_y = min_xy["min_y"]
        max_x = min_xy["max_x"]
        max_y = min_xy["max_y"]
        # NOTE(review): the vertices are emitted as top-left, bottom-left,
        # top-right, bottom-right, which is not a perimeter order. The order is
        # kept unchanged so existing consumers of the output are unaffected.
        return {
            "confidence": confidence,
            "mention_text": final_mention_text,
            "page_anchor": {
                "page_refs": [
                    {
                        "bounding_poly": {
                            "normalized_vertices": [
                                {"x": min_x, "y": min_y},
                                {"x": min_x, "y": max_y},
                                {"x": max_x, "y": min_y},
                                {"x": max_x, "y": max_y},
                            ]
                        },
                        "page": page,
                    }
                ]
            },
            "text_anchor": {"text_segments": text_anc},
            "type": ent_type,
        }

    new_entities = []
    for key, value in Synonyms_entities.items():
        for syn in value:
            for match in find_substring_indexes(json_dict.text, syn):
                print(match)
                for page, page_obj in enumerate(json_dict.pages):
                    segments = page_obj.layout.text_anchor.text_segments
                    # A page without text cannot contain the match.
                    if not segments:
                        continue
                    page_start = int(segments[0].start_index or 0)
                    page_end = int(segments[0].end_index)
                    # Both match[1] and page_end are exclusive, so a match may
                    # end exactly where the page text ends.
                    if not (page_start <= match[0] and match[1] <= page_end):
                        continue
                    try:
                        min_xy, text_anc, confidence = get_token(
                            json_dict,
                            page,
                            [{"start_index": match[0], "end_index": match[1]}],
                        )
                        new_entities.append(
                            create_ent(key, min_xy, text_anc, page, confidence)
                        )
                    except Exception as e:
                        # Best effort: a match that cannot be resolved to tokens
                        # is reported and skipped instead of aborting the file.
                        print(e)

    for ent1 in new_entities:
        json_dict.entities.append(ent1)

    return json_dict

In [None]:
file_names_list, file_dict = utilities.file_names(gcs_input_path)

# Bucket names and the output prefix are the same for every file, so derive them once.
input_bucket_name = gcs_input_path.split("/")[2]
output_bucket_name = gcs_output_path.split("/")[2]
# Normalise the output folder so it works with or without a trailing slash
# ("gs://bucket/folder" would otherwise produce "folder<filename>").
output_prefix = "/".join(gcs_output_path.split("/")[3:]).strip("/")
if output_prefix:
    output_prefix += "/"

for filename, filepath in tqdm(file_dict.items(), desc="Progress"):
    print(">>>>>>>>>>>>>>> Processing File : ", filename)
    # Only JSON files are handled; a suffix test avoids picking up names
    # that merely contain ".json" somewhere in the path.
    if not filepath.endswith(".json"):
        continue
    json_dict = utilities.documentai_json_proto_downloader(
        input_bucket_name, filepath
    )
    json_dict_updated = synonym_entities(json_dict, synonyms_entities)
    output_path_within_bucket = output_prefix + filename
    utilities.store_document_as_json(
        documentai.Document.to_json(json_dict_updated),
        output_bucket_name,
        output_path_within_bucket,
    )

### 4. Output Details

The output JSON files will be stored in the given output directory.