# Entities Sorting Script

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.

## Purpose and Description
This document provides instructions for sorting the entities in a parser's output from top to bottom, in the order in which they appear in the document. It provides a function that takes the parser's JSON output as input and returns the same JSON with its entities sorted.

## Prerequisites

1. Vertex AI Notebook
2. Parsed json files in GCS Folder.
3. Output folder to upload the updated json files.

## Step-by-step procedure

In [4]:
# Download the shared incubator-tools helper module (utilities.py); the run cell imports from it.
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

### 1. Input details

In [None]:
# GCS path of the folder holding the parsed input JSON files (placeholder - replace before running).
input_path = "gs://xxxxx/xxxxxxx"
# GCS path of the folder that receives the sorted output JSON files (placeholder - replace before running).
output_path = "gs://xxxxxxx/xxxxxxxx/xxxx"

input_path : GCS Path for input json files<br>
output_path: GCS Path for output json files<br>

### 2. Output

<img src="./Images/entity_sorting_output.png" width=800 height=400 alt="Entity sorting output image">

### 3. Run the code

In [None]:
from google.cloud import documentai_v1beta3 as documentai
from utilities import (
    documentai_json_proto_downloader,
    file_names,
    store_document_as_json,
)
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

# GCS path of the folder holding the parsed input JSON files (placeholder - replace before running).
input_path = "gs://xxxxx/xxxxxxx"
# GCS path of the folder that receives the sorted output JSON files (placeholder - replace before running).
output_path = "gs://xxxxxxx/xxxxxxxx/xxxx"


def _page_and_top(entity: "documentai.Document.Entity") -> Tuple[int, float]:
    """Return the ``(page index, y)`` sort key for a top-level entity.

    A parent entity is positioned by its first child property; an entity
    without properties is positioned by its own page anchor.
    """
    anchor = entity.properties[0] if entity.properties else entity
    page_ref = anchor.page_anchor.page_refs[0]
    # An unset/zero page value is treated as the first page.
    page_number = int(page_ref.page) if page_ref.page else 0
    return page_number, page_ref.bounding_poly.normalized_vertices[0].y


def entity_ordering(document: "documentai.Document") -> "documentai.Document":
    """
    Sort the entities of a document into reading order, in place.

    Child properties of every entity are ordered by the x coordinate of their
    first normalized vertex. Top-level entities are then ordered by page
    index and, within a page, by the y coordinate of the first normalized
    vertex of their first child (or of the entity itself when it has no
    children). Entities that compare equal keep their original relative
    order.

    Args:
        document: The document whose ``entities`` are to be reordered. Every
            entity (and child property) is expected to carry at least one
            page reference with at least one normalized vertex.

    Returns:
        The same document object, with ``entities`` and each entity's
        ``properties`` reassigned in sorted order.

    Raises:
        IndexError: If an entity or child property used for positioning has
            no page reference or no normalized vertices.
    """

    # Children must be sorted first: the parent's sort key below is taken
    # from its first child after this ordering.
    for entity in document.entities:
        if entity.properties:
            entity.properties = sorted(
                entity.properties,
                key=lambda child: child.page_anchor.page_refs[0]
                .bounding_poly.normalized_vertices[0]
                .x,
            )

    # One stable sort on (page, y), performed once after the loop above has
    # finished. This gives the per-page top-to-bottom order without indexing
    # a list sized by document.pages and without replacing document.entities
    # while it is being iterated.
    document.entities = sorted(document.entities, key=_page_and_top)

    return document


def main():
    """
    Sort the entities of every JSON document under ``input_path`` and write
    the results under ``output_path``.

    Reads the module-level ``input_path`` and ``output_path`` (both
    ``gs://bucket/prefix`` style paths). Each ``.json`` object reported by
    ``file_names`` is downloaded, passed through ``entity_ordering`` and
    stored in the output bucket under the output prefix with its original
    base name. Files are processed independently: a failure on one file is
    printed and the remaining files are still processed.
    """

    input_bucket_name = input_path.split("/")[2]
    output_bucket_name = output_path.split("/")[2]
    # Strip stray slashes so "gs://bucket/dir/" and "gs://bucket" do not
    # produce object names such as "dir//x.json" or "/x.json".
    output_prefix_path = "/".join(output_path.split("/")[3:]).strip("/")

    # file_names(...)[1] is used as a mapping whose values are the object
    # paths handed to the downloader - presumably paths inside the input
    # bucket; confirm against utilities.py.
    json_file_paths = [
        path for path in file_names(input_path)[1].values() if path.endswith(".json")
    ]
    for file_name in json_file_paths:
        # Broad catch is deliberate: one bad file must not abort the batch.
        try:
            document_obj = documentai_json_proto_downloader(
                input_bucket_name, file_name
            )
            sorted_document_obj = entity_ordering(document_obj)
            base_name = file_name.split("/")[-1]
            output_file_name = (
                f"{output_prefix_path}/{base_name}" if output_prefix_path else base_name
            )
            store_document_as_json(
                documentai.Document.to_json(sorted_document_obj),
                output_bucket_name,
                output_file_name,
            )
            print(f"[✓] {output_bucket_name}/{output_file_name}")
        except Exception as e:
            print(f"[x] {input_bucket_name}/{file_name} || Error : {e}")
    print("\nOperation completed")


# Run the batch sort when this notebook cell is executed.
main()