# Tagging Customer Account Number

* Author: docai-incubator@google.com

# Disclaimer
This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.

# Objective
The purpose of this post-processing script is to identify and tag the "customer_account_number" from OCR text, especially when the default invoice or pre-trained processor falls short. The script is tailored to capture account numbers beginning with "5" or "05", comprising either 9 or 10 digits. It allows for modification to suit different pattern requirements or to identify new entities by adjusting the pattern and tagging them accordingly.


**NOTE:** Any other number in the document that matches the same pattern as "customer_account_number" will also be tagged.


# Prerequisites
* Vertex AI Notebook or Colab (if using Colab, authenticate first)
* Storage bucket for storing the input and output JSON files
* Permissions for Google Cloud Storage and Vertex AI Notebook


# Step by Step Procedure

# 1. Import necessary libraries

In [2]:
# Download the incubator-tools utilities module into the current working directory
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-documentai google-cloud-storage tqdm -q

In [2]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from tqdm import tqdm
import re
import utilities
from functools import reduce
from pathlib import Path

# Input Variables Details
**input_bucket**: GCS input path. It should contain the DocAI processed output JSON files.  
**output_bucket**: GCS output path. The post-processed JSON files are stored in this path.

**NOTE**: Each GCS path must end with a trailing slash (`/`).

In [None]:
# GCS locations used by the rest of the notebook.
# Replace the placeholders with real paths; each one must end with "/".
input_bucket = "gs://xxxxxxxxxxxxxxxx/xxxxxxxxxxxxxxxxx/xxxxxxx/"  # Input Bucket GCS Path
output_bucket = (
    "gs://xxxxxxxxxxxxx/xxxxxxxxxxxxxx/"  # Output Bucket GCS Path
)

In [1]:
# A "gs://bucket/some/prefix/" URI split on "/" gives
# ["gs:", "", "bucket", "some", "prefix", ""]: index 2 is the bucket name and
# everything from index 3 onwards, re-joined, is the object prefix.
input_uri_parts = input_bucket.split("/")
output_uri_parts = output_bucket.split("/")

input_storage_bucket_name = input_uri_parts[2]
input_bucket_path_prefix = "/".join(input_uri_parts[3:])

output_storage_bucket_name = output_uri_parts[2]
output_bucket_path_prefix = "/".join(output_uri_parts[3:])
# Customer account number (CAN): an optional leading "0", then "5", then eight
# more digits (9 or 10 digits in total). Both word boundaries apply to the whole
# expression, so a digit run that merely contains such a sequence is not matched.
pattern = r"\b0?5[0-9]{8}\b"  # Change according to your CAN pattern

# Initialize Google Cloud Storage client

In [3]:
# One client serves both the input and the output bucket.
storage_client = storage.Client()

# Bucket handles for reading the processed documents and writing the results.
source_bucket = storage_client.bucket(input_storage_bucket_name)
destination_bucket = storage_client.bucket(output_storage_bucket_name)

# Listing of the objects stored under the input prefix.
source_blob = source_bucket.list_blobs(prefix=input_bucket_path_prefix)

# File names found under the input path, plus a dictionary keyed by file name
# (the processing loop uses its values as the blob paths to download).
list_of_files, file_name_dict = utilities.file_names(input_bucket)

In [6]:
def get_token_from_text_anchors(doc, text_anchors_check):
    """
    Builds the bounding box of the OCR tokens covered by a text span.

    Every token on every page is compared with the span described by the last
    segment of ``text_anchors_check``. A token contributes its normalized
    vertices when it lies inside that span (one extra trailing character is
    tolerated) and is longer than 3 characters. The result is the axis-aligned
    rectangle that encloses all contributing vertices.

    Args:
        doc (documentai.Document): DocumentAI document whose page tokens are searched.
        text_anchors_check (list): Text segments of the span to locate. Only the
            last segment is used.

    Returns:
        tuple: ``([A, B, C, D], page_number)`` where A, B, C and D are
        ``{"x": ..., "y": ...}`` dicts for the top-left, top-right, bottom-right
        and bottom-left corners, and ``page_number`` is the index of the last
        page on which a matching token was found.

    Raises:
        ValueError: If ``text_anchors_check`` is empty or no token matches the span.
    """
    if not text_anchors_check:
        raise ValueError("text_anchors_check must contain at least one text segment")

    # The target span does not depend on the token, so compute it once.
    target_segment = text_anchors_check[-1]
    start_check = target_segment.start_index
    # NOTE(review): the +1 presumably tolerates the trailing whitespace/newline
    # that a token's anchor can include - confirm against real OCR output.
    end_check = target_segment.end_index + 1

    xs = []
    ys = []
    page_number = 0

    for page_index, page in enumerate(doc.pages):
        for token in page.tokens:
            segments = token.layout.text_anchor.text_segments
            if not segments:
                # A token without an anchor cannot be compared; skip it instead
                # of reusing the indices of the previously visited token.
                continue

            last_segment = segments[-1]
            start_temp = getattr(last_segment, "start_index", 0)
            end_temp = last_segment.end_index

            inside_span = start_temp >= start_check and end_temp <= end_check
            if not inside_span or (end_temp - start_temp) <= 3:
                continue

            for vertex in token.layout.bounding_poly.normalized_vertices:
                xs.append(vertex.x)
                ys.append(vertex.y)
                page_number = page_index

    if not xs:
        # Callers rely on an exception to reject matches that have no token.
        raise ValueError("no OCR token matches the requested text span")

    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)

    A = {"x": min_x, "y": min_y}
    B = {"x": max_x, "y": min_y}
    C = {"x": max_x, "y": max_y}
    D = {"x": min_x, "y": max_y}

    return [A, B, C, D], page_number


def create_entity(mention_text, type_, m) -> documentai.Document.Entity:
    """
    Builds a DocumentAI entity for one regex match.

    The entity receives the given text and type, a single empty page reference
    (to be filled in by the caller) and one text segment spanning the match.

    Args:
        mention_text (str): Text stored as the entity's mention text.
        type_ (str): Entity type to assign.
        m: Match object; its ``start()`` and ``end()`` give the text segment bounds.

    Returns:
        documentai.Document.Entity: The newly created entity.
    """
    # Text segment covering exactly the matched character range.
    matched_span = documentai.Document.TextAnchor.TextSegment(
        start_index=m.start(),
        end_index=m.end(),
    )

    new_entity = documentai.Document.Entity()
    new_entity.mention_text = mention_text
    new_entity.type_ = type_
    # Placeholder page reference; page and bounding box are set by the caller.
    new_entity.page_anchor.page_refs.append(documentai.Document.PageAnchor.PageRef())
    new_entity.text_anchor.text_segments.append(matched_span)
    return new_entity

# Loop through the list of files and process them

In [3]:
# The output prefix is required to end with "/"; strip it before joining so the
# objects are written to "prefix/name" rather than "prefix//name".
output_prefix = output_bucket_path_prefix.rstrip("/")

for i in list_of_files:
    print(i)
    doc = utilities.documentai_json_proto_downloader(
        input_storage_bucket_name, file_name_dict[i]
    )

    try:
        # Mention texts already tagged by the processor; these are not tagged again.
        existing_account_numbers = {
            entity.mention_text
            for entity in doc.entities
            if entity.type_ == "customer_account_number"
        }

        for m in re.finditer(pattern, doc.text):
            matched_text = m.group()
            if matched_text in existing_account_numbers:
                continue

            entity = create_entity(matched_text, "customer_account_number", m)
            try:
                # Locate the OCR token(s) behind the match to get its page and box.
                vertices, page_number = get_token_from_text_anchors(
                    doc, entity.text_anchor.text_segments
                )
                entity.page_anchor.page_refs[
                    0
                ].bounding_poly.normalized_vertices = vertices
                entity.page_anchor.page_refs[0].page = page_number
            except Exception:
                # Best effort: a match with no suitable OCR token is skipped,
                # the rest of the document is still processed.
                print(
                    "Not able to find "
                    + matched_text
                    + " in the OCR as a single token, so rejected."
                )
                continue

            doc.entities.append(entity)

        output_blob_name = f"{output_prefix}/{i}" if output_prefix else i
        utilities.store_document_as_json(
            documentai.Document.to_json(doc),
            output_storage_bucket_name,
            output_blob_name,
        )

    except Exception as exc:
        # Keep going with the remaining files, but report what went wrong.
        print("Not Able to Parse " + file_name_dict[i] + ": " + repr(exc))

print("Completed")

# 3. Output

The post-processed JSON files can be found in the storage path that the user provided in **output_bucket**.

### Input Sample
<img src="./images/CAN_input.png" width=800 height=200></img>

### Output Sample
<img src="./images/CAN_output.png" width=800 height=200></img>