# Entity Label Restructuring Tool

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.


## Objective

This tool will first restructure the labeling, eliminating nested entities and treating them as standard entities. Subsequently, post-processing will reinstate the visual grouping originally established by the user.

## Prerequisites
* Vertex AI Notebook
* DocumentAI Parser output
* GCS bucket

## Step by Step procedure 

### 1.Importing Required Modules

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-documentai google-cloud-storage

In [None]:
from google.cloud import documentai_v1beta3 as documentai
from google.api_core.client_options import ClientOptions
import json
from pathlib import Path
from tqdm import tqdm
from google.cloud import storage
from utilities import store_document_as_json

### 2.Setup the inputs
* `project_id` : Provide GCP project Number
* `location` : The region where the resources or services are hosted
* `processor_id` : The unique identifier of the Google Cloud Processor.
* `schema_file_path`: The Google Cloud Storage (gs://) folder where the exported schema file will be stored
* `export_dataset_path`: The Google Cloud Storage (gs://) folder where the exported labeled JSON files will be stored

In [None]:
# Placeholder inputs for the export step -- replace every value before running.
project_id = "project_id"
# Processor region. Set this to either "us" or "eu".
# (The previous expression `"us" or "eu"` always evaluated to "us".)
location = "us"
processor_id = "xxxx-xxxx-xxxx"
# gs:// folder that receives the exported schema file.
schema_file_path = "gs://bucket_name/path_to_schema_file/"
# gs:// folder that receives the exported (flattened) document JSON files.
export_dataset_path = "gs://bucket_name/path_to_output_folder/"

### 3.Run the required functions

In [None]:
def list_documents(
    project_id: str,
    location: str,
    processor: str,
    page_size: int = 100,
    page_token: str = "",
) -> documentai.types.ListDocumentsResponse:
    """
    Lists documents in a dataset for a specified Document AI processor.

    Args:
        project_id (str): The ID of the Google Cloud project.
        location (str): The location of the Document AI processor (e.g. "us" or "eu").
        processor (str): The ID of the Document AI processor.
        page_size (int, optional): The maximum number of documents to return per page. Default is 100.
        page_token (str, optional): A token for pagination to retrieve the next set of results.

    Returns:
        documentai.types.ListDocumentsResponse: The result of the list call. The
        request sets ``return_total_size=True``, so the total dataset size is
        requested alongside the page of document metadata.
    """
    # Address the regional endpoint explicitly: without it, requests for a
    # processor outside "us" are sent to the default endpoint.
    client = documentai.DocumentServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )
    dataset = (
        f"projects/{project_id}/locations/{location}/processors/{processor}/dataset"
    )
    request = documentai.types.ListDocumentsRequest(
        dataset=dataset,
        page_token=page_token,
        page_size=page_size,
        return_total_size=True,
    )
    return client.list_documents(request)


def get_document(
    project_id: str, location: str, processor: str, doc_id: str
) -> documentai.types.Document:
    """
    Retrieves a specific document from a processor's dataset.

    Args:
        project_id (str): The ID of the Google Cloud project.
        location (str): The location of the Document AI processor (e.g. "us" or "eu").
        processor (str): The ID of the Document AI processor.
        doc_id (str): The ID of the document to retrieve; forwarded unchanged
            as ``document_id`` of the request.

    Returns:
        documentai.types.Document: The ``document`` field of the service response.
    """
    # Address the regional endpoint explicitly: without it, requests for a
    # processor outside "us" are sent to the default endpoint.
    client = documentai.DocumentServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )
    dataset = (
        f"projects/{project_id}/locations/{location}/processors/{processor}/dataset"
    )
    request = documentai.types.GetDocumentRequest(dataset=dataset, document_id=doc_id)
    response = client.get_document(request)
    return response.document


def get_dataset_schema(
    project_id: str, processor_id: str, location: str
) -> documentai.types.DatasetSchema:
    """
    Retrieves the dataset schema for a specified Document AI processor.

    Args:
        project_id (str): The ID of the Google Cloud project.
        processor_id (str): The ID of the Document AI processor.
        location (str): The location of the Document AI processor (e.g. "us" or "eu").

    Returns:
        documentai.types.DatasetSchema: The dataset schema object.
    """
    # Address the regional endpoint explicitly: without it, requests for a
    # processor outside "us" are sent to the default endpoint.
    client = documentai.DocumentServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )
    request = documentai.GetDatasetSchemaRequest(
        name=processor_name + "/dataset/datasetSchema"
    )
    return client.get_dataset_schema(request=request)


def remove_child_entities(json_dict: dict) -> dict:
    """
    Flattens one level of entity nesting in place.

    Every entity that has a non-empty ``properties`` collection is replaced by
    those properties (the parent itself is dropped); entities without
    properties are kept unchanged. Relative order is preserved.

    Args:
        json_dict (dict): Object exposing an ``entities`` attribute whose items
            expose ``properties``. NOTE(review): despite the ``dict``
            annotation the value is accessed by attribute, so callers pass a
            document-like object -- confirm and tighten the annotation.

    Returns:
        dict: The same object that was passed in, with ``entities`` reassigned
        to the flattened list.
    """
    flattened = []
    for candidate in json_dict.entities:
        children = candidate.properties
        if not children:
            # Leaf entity: nothing to unwrap.
            flattened.append(candidate)
            continue
        # Parent entity: promote its children and drop the wrapper.
        flattened.extend(children)
    json_dict.entities = flattened

    return json_dict

### 4.Run the code

In [None]:
def main():
    """
    Exports the processor's dataset schema and all dataset documents to GCS.

    The schema is written as ``schema_file.json`` under ``schema_file_path``.
    Each dataset document is fetched, flattened with ``remove_child_entities``
    and written to ``<export_dataset_path>/<split>/<display_name>.json``.
    Reads the module-level inputs ``project_id``, ``location``,
    ``processor_id``, ``schema_file_path`` and ``export_dataset_path``.
    """
    # Numeric dataset split values mapped to the output sub-folder name;
    # anything else is exported under "unknown".
    split_names = {1: "train", 2: "test", 3: "unassigned"}

    # strip("/") so a trailing slash on the configured folder does not
    # produce an empty "//" segment in the object name.
    schema_bucket = schema_file_path.split("/")[2]
    schema_prefix = "/".join(schema_file_path.split("/")[3:]).strip("/")
    export_bucket = export_dataset_path.split("/")[2]
    export_prefix = "/".join(export_dataset_path.split("/")[3:]).strip("/")

    exported_schema = get_dataset_schema(project_id, processor_id, location)
    dict_schema = documentai.types.dataset.DatasetSchema.to_dict(exported_schema)
    store_document_as_json(
        json.dumps(dict_schema),
        schema_bucket,
        "/".join(part for part in (schema_prefix, "schema_file.json") if part),
    )

    # Follow next_page_token until it is empty. Comparing the collected count
    # with total_size can loop forever if the dataset changes mid-listing.
    results = list_documents(project_id, location, processor_id)
    document_list = list(results.document_metadata)
    while results.next_page_token:
        results = list_documents(
            project_id, location, processor_id, page_token=results.next_page_token
        )
        document_list.extend(results.document_metadata)

    print("Exporting Dataset...")
    for doc in tqdm(document_list):
        split = split_names.get(int(doc.dataset_type), "unknown")
        res = get_document(project_id, location, processor_id, doc.document_id)
        output_file_name = "/".join(
            part
            for part in (export_prefix, split, f"{doc.display_name}.json")
            if part
        )
        json_data = documentai.Document.to_json(remove_child_entities(res))
        store_document_as_json(json_data, export_bucket, output_file_name)


main()

### 5.Output

The updated JSON files with flattened labels will be saved in the specified output folder. The schema exported from the processor will also be saved.

#### Updated json files and schema from the processor stored in the Storage Bucket

<img src="./Images/Storage_path.png" width=800 height=400 ></img>

#### Flattened schema of the updated JSON files in the DocAI Processor UI

<img src="./Images/Flatten_entities.png" width=800 height=400 ></img>

## Grouping nested entities according to schema

### 1.Importing Required Modules

In [None]:
from google.cloud import documentai_v1beta3 as documentai
from google.api_core.client_options import ClientOptions
import json
from pathlib import Path
from tqdm import tqdm
from google.cloud import storage
from utilities import (
    file_names,
    documentai_json_proto_downloader,
    store_document_as_json,
)

### 2.Setup the inputs
* `documents_path`: The Google Cloud Storage (gs) path for documents processed
* `schema_file`: The Google Cloud Storage (gs://) path of the schema file
* `final_output_path`: The Google Cloud Storage (gs://) folder where the updated labeled JSON files will be stored

In [None]:
# Placeholder inputs for the regrouping step -- replace every value before running.
# gs:// folder containing the parsed (flat) document JSON files.
documents_path = "gs://bucket_name/path_to_parsed_jsons/"
# gs:// path of the schema JSON; the export step above writes it as
# "schema_file.json" inside its schema folder.
schema_file = "gs://bucket_name/path_to_schema_file/schema_file.json"
# gs:// folder that receives the regrouped document JSON files.
final_output_path = "gs://bucket_name/path_to_output_folder/"

### 3.Run the required functions

In [None]:
def find_schema(schema: dict) -> dict:
    """
    Builds a mapping of parent entity type -> list of child property names.

    The root entity type ``custom_extraction_document_type`` contributes one
    (initially empty) entry for every property whose ``name`` equals its
    ``value_type``. Every other entity type contributes its own property names
    under its type name.

    Args:
        schema (dict): Dataset schema as a dictionary; must contain
            ``schema["document_schema"]["entity_types"]``, each entry having a
            ``name`` and, optionally, a ``properties`` list.

    Returns:
        dict: Mapping of entity type name to the list of its property names.
    """
    root_type = "custom_extraction_document_type"
    nested_schema = {}
    for entity_type in schema["document_schema"]["entity_types"]:
        type_name = entity_type["name"]
        properties = entity_type.get("properties") or []
        if type_name == root_type:
            for schema_property in properties:
                if schema_property["name"] == schema_property.get("value_type"):
                    # setdefault (not assignment) so a child list collected
                    # earlier is kept when the root type is listed last.
                    nested_schema.setdefault(schema_property["name"], [])
        else:
            nested_schema.setdefault(type_name, []).extend(
                schema_property["name"] for schema_property in properties
            )
    return nested_schema


def get_page_bbox(entity: documentai.Document.Entity) -> list:
    """
    Computes the axis-aligned bounding box of an entity's first page reference.

    Only ``entity.page_anchor.page_refs[0]`` is considered; its normalized
    vertices are reduced to their extreme coordinates.

    Args:
        entity (documentai.Document.Entity): Entity with at least one page
            reference that carries normalized vertices.

    Returns:
        list: ``[min_x, min_y, max_x, max_y]`` over the normalized vertices.
    """
    first_ref = entity.page_anchor.page_refs[0]
    xs = []
    ys = []
    for vertex in first_ref.bounding_poly.normalized_vertices:
        xs.append(vertex.x)
        ys.append(vertex.y)

    return [min(xs), min(ys), max(xs), max(ys)]


def tag_line_items(
    nested_schema: dict, json_dict: documentai.Document
) -> documentai.Document:
    """
    Regroups flat child entities under parent entities based on a nested schema.

    Every entity whose type is listed as a child of some parent in
    ``nested_schema`` is removed from ``json_dict.entities``. For each parent
    type, ONE parent entity is appended whose ``properties`` are all of those
    children, whose bounding box encloses them, and whose text anchor and
    mention text are built from the children's text segments (sorted by end
    index). The document is modified in place.

    Args:
        nested_schema (dict): A mapping of parent entity type to the list of
            its child entity types.
        json_dict (documentai.Document): The document object containing the
            flat entities to be grouped.

    Returns:
        documentai.Document: The same document object, updated.
    """
    # Walk backwards so deleting by index never shifts entities that are
    # still to be visited.
    child_items = {}
    for i in range(len(json_dict.entities) - 1, -1, -1):
        entity = json_dict.entities[i]
        for parent, child_types in nested_schema.items():
            if entity.type in child_types:
                child_items.setdefault(parent, []).append(entity)
                del json_dict.entities[i]
                break

    grouped_line_items = []
    for parent, children in child_items.items():
        text_segments = []
        x_coords = []
        y_coords = []
        for child in children:
            min_x, min_y, max_x, max_y = get_page_bbox(child)
            x_coords.extend([min_x, max_x])
            y_coords.extend([min_y, max_y])
            # The parent is anchored to the page of the last child processed.
            page = child.page_anchor.page_refs[0].page
            for seg in child.text_anchor.text_segments:
                text_segments.append(
                    {
                        "start_index": str(seg.start_index),
                        "end_index": str(seg.end_index),
                    }
                )
        if not text_segments:
            # No text to anchor a parent on. Put the children back (in their
            # original relative order) instead of silently dropping them.
            json_dict.entities.extend(reversed(children))
            continue

        text_segments.sort(key=lambda seg: int(seg["end_index"]))
        # join() avoids the leading space that prefix-concatenation produced.
        mention_text = " ".join(
            json_dict.text[int(seg["start_index"]) : int(seg["end_index"])]
            for seg in text_segments
        )
        left, right = min(x_coords), max(x_coords)
        top, bottom = min(y_coords), max(y_coords)
        grouped_line_items.append(
            {
                "mention_text": mention_text,
                "page_anchor": {
                    "page_refs": [
                        {
                            "page": page,
                            "bounding_poly": {
                                "normalized_vertices": [
                                    {"x": left, "y": top},
                                    {"x": right, "y": top},
                                    {"x": right, "y": bottom},
                                    {"x": left, "y": bottom},
                                ]
                            },
                        }
                    ]
                },
                "properties": list(children),
                "text_anchor": {"text_segments": text_segments},
                "type": parent,
            }
        )
    json_dict.entities.extend(grouped_line_items)
    return json_dict

### 4.Run the code

In [None]:
def main():
    """
    Regroups flat entities in every parsed document according to the schema.

    Downloads the schema JSON from ``schema_file``, derives the parent/child
    mapping with ``find_schema``, then for each file found under
    ``documents_path`` applies ``tag_line_items`` and stores the result under
    ``final_output_path`` using the same file name. Reads the module-level
    inputs ``documents_path``, ``schema_file`` and ``final_output_path``.
    """
    schema_bucket = schema_file.split("/")[2]
    schema_blob = "/".join(schema_file.split("/")[3:])
    documents_bucket = documents_path.split("/")[2]
    output_bucket = final_output_path.split("/")[2]
    # strip("/") so a trailing slash on the configured folder does not
    # produce an empty "//" segment in the object name.
    output_prefix = "/".join(final_output_path.split("/")[3:]).strip("/")

    storage_client = storage.Client()
    schema_bytes = (
        storage_client.get_bucket(schema_bucket).blob(schema_blob).download_as_bytes()
    )
    nested_schema = find_schema(json.loads(schema_bytes))

    file_name_list, file_path_dict = file_names(documents_path)
    for file_name in file_name_list:
        blob_path = file_path_dict[file_name]
        print("Running on this file path : ", f"gs://{documents_bucket}/{blob_path}")
        json_data = documentai_json_proto_downloader(documents_bucket, blob_path)
        updated_json = tag_line_items(nested_schema, json_data)
        store_document_as_json(
            documentai.Document.to_json(updated_json),
            output_bucket,
            "/".join(part for part in (output_prefix, file_name) if part),
        )


main()

### 5.Output

The code will modify the JSON files to incorporate nested entities as defined by the schema file. These updated files, complete with the necessary labels, will then be stored within the designated output folder.

#### Exported Schema in the DocAI Processor UI

<img src="./Images/UI_Nested_Entities.png" width=800 height=400 ></img>