# Tag Column Number to OCR Paragraphs

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

# Objective
This tool adds a column-number (`col_num`) attribute to every paragraph in the OCR processor results (JSON output).

# Prerequisites
* GCP Project ID
* DocumentAI Processor ID
* Cloud Storage(GCS)

# Step-by-Step Procedure

## 1. Import Modules/Packages

In [4]:
# !wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [2]:
# !pip install google-api-core
# !pip install google-cloud-documentai
# !pip install google-cloud-storage

In [1]:
import json
from typing import List, Optional

from google.api_core.operation import Operation
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage

from utilities import file_names, store_document_as_json

## 2. Input Details

* **PROJECT_ID** : Provide GCP project ID 
* **LOCATION** : Provide DocumentAI processor location (“us” or “eu”)
* **PROCESSOR_ID** : Provide GCP DocumentAI processor id
* **PROCESSOR_VERSION_ID** : Provide GCP DocumentAI processor version id
* **GCS_INPUT_URI_RAW_PDF** : Provide GCS path of raw pdf files
* **GCS_OUTPUT_URI_JSON** :  Provide GCS path to store batch process results
* **GCS_OUTPUT_URI_PROCESSED_JSON** :  Provide GCS path to store post-processed results

In [8]:
# Placeholder inputs for the notebook: replace every value with your own details.
# GCP project ID
PROJECT_ID = "xx-xx-project"
# DocumentAI processor location
LOCATION = "us"  # or 'eu'
# DocumentAI processor id
PROCESSOR_ID = "xx-4cb-xx-cb4-xx"
# DocumentAI processor version id
PROCESSOR_VERSION_ID = "pretrained-ocr-v2.0-2023-06-02"
# GCS path of the raw pdf files
GCS_INPUT_URI_RAW_PDF = "gs://bucket/path_to/input"
# GCS path to store batch process results
GCS_OUTPUT_URI_JSON = "gs://bucket/path/output/docai_results"
# GCS path to store post-processed results
# NOTE(review): this placeholder has no "/" after "bucket", so the code below
# would read "bucket_path_to" as the bucket name - confirm that is intended.
GCS_OUTPUT_URI_PROCESSED_JSON = "gs://bucket_path_to/output/post_processed"

## 3. Run Below Code-Cells

In [None]:
def batch_process_documents_sample(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    timeout: int = 500,
) -> Operation:
    """Submit a DocumentAI batch-process job for a GCS prefix and wait for it

    Args:
        project_id (str): GCP project ID
        location (str): Processor location, used to build the API endpoint
        processor_id (str): GCP DocumentAI ProcessorID
        gcs_input_uri (str): GCS prefix holding the input files
        gcs_output_uri (str): GCS path where the JSON results are written
        processor_version_id (str, optional): VersionID of DocumentAI Processor.
            When falsy, the processor itself is addressed. Defaults to None.
        timeout (int, optional): Value passed to `operation.result()` as the
            maximum waiting time. Defaults to 500.

    Returns:
        Operation: The long-running operation object of the batch job,
            returned after `result()` has been waited on
    """

    # The endpoint is regional, so it is derived from the processor location
    endpoint = f"{location}-documentai.googleapis.com"
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": endpoint}
    )

    # Every file under the input prefix is part of the batch
    source_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
    batch_input = documentai.BatchDocumentsInputConfig(gcs_prefix=source_prefix)

    # Results are sharded into JSON files of up to 100 pages each
    gcs_destination = {
        "gcs_uri": gcs_output_uri,
        "sharding_config": {"pages_per_shard": 100},
    }
    batch_output = documentai.DocumentOutputConfig(gcs_output_config=gcs_destination)

    print("Documents are processing(batch-documents)...")
    # Address a specific processor version only when one was requested
    if processor_version_id:
        resource_name = docai_client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        resource_name = docai_client.processor_path(project_id, location, processor_id)

    batch_request = documentai.types.document_processor_service.BatchProcessRequest(
        name=resource_name,
        input_documents=batch_input,
        document_output_config=batch_output,
    )
    lro = docai_client.batch_process_documents(batch_request)
    print("Waiting for operation to complete...")
    lro.result(timeout=timeout)
    print("Batch process completed")
    return lro


def get_vertices(bounding_poly: documentai.BoundingPoly) -> List[float]:
    """Collapse a bounding polygon into its axis-aligned bounding box

    Args:
        bounding_poly (documentai.BoundingPoly):
            Polygon whose `vertices` each expose `x` and `y` values

    Returns:
        List[float]: Box extents ordered as xmin, ymin, xmax & ymax
    """

    corners = list(bounding_poly.vertices)
    x_values = [corner.x for corner in corners]
    y_values = [corner.y for corner in corners]
    return [min(x_values), min(y_values), max(x_values), max(y_values)]


# Paragraph boxes whose height is at most this value (in the units of
# `bounding_poly.vertices`) are tagged "-1" whatever their horizontal position.
# The value is tunable.
MAX_UNTAGGED_PARAGRAPH_HEIGHT = 57


def _column_number(bbox: List[float], mid: float, max_height: float) -> str:
    """Return the column tag for one paragraph bounding box

    Args:
        bbox (List[float]): Paragraph bbox as xmin, ymin, xmax & ymax
        mid (float): x coordinate separating the left and right columns
        max_height (float): Boxes at most this tall are tagged "-1"

    Returns:
        str: "-1" when the box straddles `mid` or is not taller than
            `max_height`, "1" when it lies left of `mid`, "2" otherwise
    """

    x_min, y_min, x_max, y_max = bbox
    if x_min < mid < x_max:
        return "-1"
    if (y_max - y_min) <= max_height:
        return "-1"
    if x_max <= mid:
        return "1"
    # Not straddling and not left of `mid`, so the box starts at or after it
    return "2"


gcs_input_uri_raw_pdf = GCS_INPUT_URI_RAW_PDF.rstrip("/")
gcs_output_uri_json = GCS_OUTPUT_URI_JSON.rstrip("/")
gcs_output_uri_processed_json = GCS_OUTPUT_URI_PROCESSED_JSON.rstrip("/")
#  Calling Batch Process
res = batch_process_documents_sample(
    PROJECT_ID,
    LOCATION,
    PROCESSOR_ID,
    gcs_input_uri_raw_pdf.strip("/") + "/",
    gcs_output_uri_json,
    PROCESSOR_VERSION_ID,
)
print(res.metadata)
# The batch results are read from a sub-folder named after the operation id
operation_id = res.operation.name.split("/")[-1]
gcs_output_uri_json = gcs_output_uri_json + f"/{operation_id}"
# "gs://bucket/a/b" splits into ["gs:", "", "bucket", "a", "b"]
splits = gcs_output_uri_json.split("/")
input_bucket, input_prefix = splits[2], "/".join(splits[3:])
splits = gcs_output_uri_processed_json.split("/")
output_bucket, output_prefix = splits[2], "/".join(splits[3:])

# presumably maps each result file name to its blob path - verify in utilities
_, files_dict = file_names(gcs_output_uri_json)


sc = storage.Client()
input_bucket_obj = sc.get_bucket(input_bucket)
output_bucket_obj = sc.get_bucket(output_bucket)
print("Postprocessing started...")
for fn, fp in files_dict.items():
    print("File: ", fn)
    # download_as_bytes() replaces the deprecated download_as_string() alias
    json_bytes = input_bucket_obj.blob(fp).download_as_bytes()
    doc = documentai.Document.from_json(json_bytes)
    json_d = documentai.Document.to_dict(doc)
    # `to_dict` keeps pages and paragraphs in document order, so every proto
    # paragraph lines up with its dict counterpart by position. Pairing them
    # this way needs no text-segment lookup, so paragraphs without text
    # segments are tagged like any other instead of being dropped.
    for page, page_d in zip(doc.pages, json_d["pages"]):
        boxes = [get_vertices(p.layout.bounding_poly) for p in page.paragraphs]
        # Horizontal extent covered by the paragraph boxes of this page
        x1 = min([page.dimension.width] + [box[0] for box in boxes])
        x2 = max([0] + [box[2] for box in boxes])
        # Midpoint of text span in a page
        mid = (x1 + x2) // 2
        for box, para_d in zip(boxes, page_d["paragraphs"]):
            para_d["layout"]["col_num"] = _column_number(
                box, mid, MAX_UNTAGGED_PARAGRAPH_HEIGHT
            )

    json_s = json.dumps(json_d)
    # Avoid a leading "/" in the blob name when the output URI is a bare bucket
    blob_name = f"{output_prefix}/{fn}" if output_prefix else fn
    store_document_as_json(json_s, output_bucket, blob_name)
    print("\t", f"File uploaded to gs://{output_bucket}/{blob_name}")

# 4. Output Details

After post-processing, the `layout` of every paragraph in the JSON result gets a new key, `col_num`, whose value (stored as a string) is -1, 1 or 2. Values 1 and 2 mark paragraphs that lie entirely in the left half and the right half of the page's text span, respectively. A paragraph gets -1 when it straddles the midpoint of the text span or when its bounding box is no taller than the tunable height threshold (57 in the code above).

<b>Sample JSON Keys</b>  
<img src="./images/pre_processed.png" height=400 width=300 alt='pre_processed'>   
<b> Postprocessed JSON Keys</b>  
<img src="./images/post_processed.png" height=400 width=300 alt='post_processed'> 