# Tag line information to Layout Text Blocks

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.


## Objective

The purpose of this tool is to append line information to the output of the Layout parser by utilizing the output of Document OCR. Each text block is augmented with the detectedLanguages attribute, which contains information about line attributes that are consistent with the OCR output.

NOTE: This tool expects every text block returned by the Layout parser to appear with exactly the same text in the Document OCR output.

## Prerequisites
* Vertex AI Notebook
* GCP Document AI Layout Parser and Document OCR processor IDs
* GCS bucket

## Step-by-step procedure

### 1.Importing Required Modules

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-documentai google-cloud-storage

In [None]:
from typing import Optional, Any, List, Dict
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from google.api_core.client_options import ClientOptions
from utilities import file_names, store_document_as_json
import json

### 2.Setup the inputs

* `project_id` : A unique identifier for a Google Cloud project.
* `location` : The Document AI location of the processors, e.g., `us` or `eu`.
* `ocr_processor_id` : Identifier for the Optical Character Recognition (OCR) processor.
* `ocr_processor_version` : Version number of the OCR processor.
* `layout_processor_id` : Identifier for the layout processor.
* `layout_processor_version` : Version number of the layout processor.
* `input_path` : GCS folder URI (`gs://bucket_name/folder/`) containing the input PDF files.
* `output_path` : GCS folder URI (`gs://bucket_name/folder/`) where the updated JSON files are written.

In [None]:
# Google Cloud project that owns the processors.
project_id = "xxxx-xxxx-xxxx"
# Document AI location of the processors: "us" or "eu".
# (The previous `"us" or "eu"` expression always evaluated to "us".)
location = "us"
# Document OCR processor and the version to call.
ocr_processor_id = "xxxx-xxxx-xxxx"
ocr_processor_version = "xxxx-xxxx-xxxx"
# Layout parser processor and the version to call.
layout_processor_id = "xxxx-xxxx-xxxx"
layout_processor_version = "xxxx-xxxx-xxxx"
# GCS folder URIs; keep the trailing slash.
input_path = "gs://bucket_name/path_to_pdf's/"
output_path = "gs://bucket_name/path_to_output_folder/"

#### Global Variable declaration

In [None]:
# Process options passed with the Layout parser request in main():
# chunking is configured with chunk_size=1000 and include_ancestor_headings=True.
# NOTE(review): the unit of chunk_size is defined by the API (presumably tokens) — confirm.
layout_process_options = documentai.ProcessOptions(
    layout_config=documentai.ProcessOptions.LayoutConfig(
        chunking_config=documentai.ProcessOptions.LayoutConfig.ChunkingConfig(
            chunk_size=1000,
            include_ancestor_headings=True,
        )
    )
)

# Process options passed with the Document OCR request in main():
# symbol output is enabled, plus the premium selection-mark and math OCR features.
ocr_process_options = documentai.ProcessOptions(
    ocr_config=documentai.OcrConfig(
        enable_symbol=True,
        premium_features=documentai.OcrConfig.PremiumFeatures(
            enable_selection_mark_detection=True, enable_math_ocr=True
        ),
    )
)

# Shared state used by get_line_info(): maps a standardized block text to the
# index in the standardized OCR text where that text was last found (-1 when
# the last search failed). The next search for the same text starts just after
# that index, so repeated identical blocks resolve to successive occurrences.
# Being module-level, the mapping persists across get_line_info() calls.
latest_lookup_index = {}

### 3.Run the required functions

In [None]:
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    pdf_bytes: bytes,
    processor_version: Optional[str] = None,
    process_options: Optional[Any] = None,
    mime_type: str = "application/pdf",
) -> documentai.Document:
    """
    Send one document to a Document AI processor and return the processed document.

    Parameters:
    project_id (str): The ID of the Google Cloud project.
    location (str): The location of the processor (e.g., 'us' or 'eu').
    processor_id (str): The ID of the Document AI processor.
    pdf_bytes (bytes): The raw bytes of the file to be processed.
    processor_version (Optional[str]): The specific processor version to call.
        When empty or None, the processor itself is addressed instead.
    process_options (Optional[Any]): Process options forwarded with the request (default is None).
    mime_type (str): MIME type declared for `pdf_bytes` (default is 'application/pdf').

    Returns:
    documentai.Document: The `document` field of the processor response.
    """

    # The regional endpoint is derived from `location`; it must be set
    # explicitly whenever the location is not "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Address a pinned processor version when one is given, otherwise the processor.
    if processor_version:
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    raw_document = documentai.RawDocument(content=pdf_bytes, mime_type=mime_type)
    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        process_options=process_options,
    )
    # Synchronous (online) processing call.
    result = client.process_document(request=request)

    return result.document


def standardize_text(text: str) -> str:
    """
    Normalize text so Layout parser and OCR strings can be compared.

    Every substitution maps one character to one character, so character
    positions in the result line up with positions in the input.

    Parameters:
    text (str): The text to be standardized.

    Returns:
    str: The text with each newline turned into a space and each curly double
    quote (opening or closing) turned into a straight double quote.
    """
    substitutions = str.maketrans({"\n": " ", "“": '"', "”": '"'})
    return text.translate(substitutions)


def get_line_info(
    blocks: List[Dict[str, Any]], ocr_json: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """
    Attach OCR line-level `detectedLanguages` entries to Layout parser text blocks.

    Each text block is located inside the standardized OCR text. Every OCR line
    whose first text segment lies within the matched character range contributes
    its `detectedLanguages` entries to `block["textBlock"]["detectedLanguages"]`.
    Nested text blocks are processed recursively. Blocks without a `textBlock`
    are left untouched. A text block that cannot be found in the OCR text gets
    an empty `detectedLanguages` list.

    The search position per distinct text is remembered in the module-level
    `latest_lookup_index`, so repeated identical texts map to successive
    occurrences in the OCR text.

    Parameters:
    blocks (List[Dict[str, Any]]): Layout blocks (JSON form); modified in place.
    ocr_json (Dict[str, Any]): OCR document (JSON form) providing `text` and `pages`.

    Returns:
    List[Dict[str, Any]]: The same `blocks` list, with detected languages added to its text blocks.
    """
    # Standardize the OCR text once: it is identical for every block and
    # every nesting level.
    source_text = standardize_text(ocr_json.get("text", ""))
    _annotate_blocks(blocks, ocr_json.get("pages", []), source_text)
    return blocks


def _annotate_blocks(
    blocks: List[Dict[str, Any]],
    ocr_pages: List[Dict[str, Any]],
    source_text: str,
) -> None:
    """Set `detectedLanguages` on every text block in `blocks`, recursing into nested blocks."""
    for block in blocks:
        text_block = block.get("textBlock")
        if text_block is None:
            continue

        block_text = standardize_text(text_block["text"])
        # Resume the search just after the previous hit for the same text.
        previous_hit = latest_lookup_index.get(block_text, -1)
        start = source_text.find(block_text, previous_hit + 1)
        latest_lookup_index[block_text] = start

        if start == -1:
            # Not found: without this guard the range would degenerate to the
            # beginning of the document and pick up unrelated lines.
            text_block["detectedLanguages"] = []
        else:
            # One extra character of slack so a line whose segment ends one
            # position past the block text (presumably its trailing newline)
            # still counts as inside the block.
            end = start + len(block_text) + 1
            text_block["detectedLanguages"] = _collect_line_languages(
                _pages_for_block(block, ocr_pages), start, end
            )

        nested_blocks = text_block.get("blocks")
        if nested_blocks:
            _annotate_blocks(nested_blocks, ocr_pages, source_text)


def _pages_for_block(
    block: Dict[str, Any], ocr_pages: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the OCR pages covered by the block's `pageSpan` (all pages when the span is unusable)."""
    page_span = block.get("pageSpan") or {}
    first_page = int(page_span.get("pageStart", 0))
    if first_page < 1:
        # Page numbers are used 1-based here; an unset span cannot narrow the
        # search, and the character-range check keeps the result correct anyway.
        return ocr_pages
    last_page = max(first_page, int(page_span.get("pageEnd", first_page)))
    return ocr_pages[first_page - 1 : last_page]


def _collect_line_languages(
    pages: List[Dict[str, Any]], start: int, end: int
) -> List[Dict[str, Any]]:
    """Gather `detectedLanguages` of all lines whose first text segment lies within [start, end]."""
    languages: List[Dict[str, Any]] = []
    for page in pages:
        for line in page.get("lines", []):
            segments = (
                line.get("layout", {}).get("textAnchor", {}).get("textSegments", [])
            )
            if not segments:
                continue
            first_segment = segments[0]
            # Index values may be serialized as strings or omitted when zero.
            line_start = int(first_segment.get("startIndex", 0))
            line_end = int(first_segment.get("endIndex", 0))
            if start <= line_start and line_end <= end:
                languages.extend(line.get("detectedLanguages", []))
    return languages

### 4.Run the code

In [None]:
def main():
    """
    Process every input file with both processors and store the enriched layout JSON.

    For each file listed under `input_path`:
      1. download its bytes from the input bucket,
      2. run the Document OCR processor and the Layout parser on it,
      3. add OCR line information to the layout blocks via `get_line_info`,
      4. upload the resulting layout JSON to `output_path` under the same base
         name with a `.json` extension.

    Reads the notebook-level configuration variables; returns nothing.
    """
    input_bucket = input_path.split("/")[2]
    output_bucket = output_path.split("/")[2]
    output_path_prefix = "/".join(output_path.split("/")[3:])
    # Guard against an output path given without a trailing slash, which
    # would otherwise glue the folder name onto the file name.
    if output_path_prefix and not output_path_prefix.endswith("/"):
        output_path_prefix += "/"

    _, files_dict = file_names(input_path)
    storage_client = storage.Client()
    input_bucket_obj = storage_client.get_bucket(input_bucket)
    for file_name, file_path in files_dict.items():
        pdf_bytes = input_bucket_obj.blob(file_path).download_as_bytes()
        print(f"processing...{file_name}")
        ocr_res = process_document_sample(
            project_id=project_id,
            location=location,
            processor_id=ocr_processor_id,
            pdf_bytes=pdf_bytes,
            processor_version=ocr_processor_version,
            process_options=ocr_process_options,
        )
        ocr_json_data = json.loads(documentai.Document.to_json(ocr_res))
        layout_res = process_document_sample(
            project_id=project_id,
            location=location,
            processor_id=layout_processor_id,
            pdf_bytes=pdf_bytes,
            # Use the configured layout version, mirroring the OCR call.
            processor_version=layout_processor_version,
            process_options=layout_process_options,
        )
        layout_json_data = json.loads(documentai.Document.to_json(layout_res))
        # Search positions are only meaningful within one document: clear the
        # shared index in place (rebinding the name here would merely create a
        # local variable) so earlier files cannot skew this file's matches.
        latest_lookup_index.clear()
        blocks = get_line_info(
            layout_json_data["documentLayout"]["blocks"], ocr_json_data
        )
        layout_json_data["documentLayout"]["blocks"] = blocks
        # Swap only the final extension, whatever its case, for ".json".
        base_name = file_name.rsplit(".", 1)[0]
        output_file_name = f"{output_path_prefix}{base_name}.json"
        print("saving ", output_bucket, output_file_name)
        store_document_as_json(
            json.dumps(layout_json_data), output_bucket, output_file_name
        )


main()

### 5.Output

The updated JSONs containing line information will be saved to the specified output folder.

#### Updated JSON file 
<img src="./Images/Updated_JSON.png" width=800 height=400 ></img>