## This notebook covers how you can leverage Form Parser to generate annotations for Document AI Workbench.

The code takes the Form Parser `form_fields` and converts them to `entities`.

- Uses Fake W-2 Dataset from [Kaggle](https://www.kaggle.com/datasets/mcvishnu1/fake-w2-us-tax-form-dataset)
- Originally created by @HSbedi87

In [None]:
# Install necessary Python libraries and restart your kernel after.
# !python -m pip install -r requirements.txt

In [None]:
import json
import re
import time
from typing import List

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.cloud import storage

## Set your processor variables 

In [None]:
# Google Cloud project that owns the processor. LOCATION also selects the
# regional API endpoint ("<LOCATION>-documentai.googleapis.com").
PROJECT_ID = ""
LOCATION = "us"  # Format is 'us' or 'eu'

PROCESSOR_ID = ""  # Create processor in Cloud Console
# Input PDFs: bucket name only (the code prepends "gs://" itself) and the
# object-name prefix used when listing the bucket.
GCS_INPUT_BUCKET = ""
GCS_INPUT_PREFIX = ""
# Batch output destination, combined as
# f"{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/". GCS_OUTPUT_URI must have the
# form "gs://<bucket>" because the parsing step extracts the bucket name from
# the combined URI with a "gs://..." regular expression.
GCS_OUTPUT_URI = ""
GCS_OUTPUT_URI_PREFIX = ""
# Destination for the generated annotation JSON files: bucket name only, and a
# prefix that is concatenated verbatim with the file name (end it with "/" if
# the files should land in a folder).
GCS_OUTPUT_ANNOTATION_BUCKET = ""
GCS_OUTPUT_ANNOTATION_URI_PREFIX = ""

# Number of seconds to sleep between groups of batch process requests.
TIMEOUT = 360

# MIME type attached to every input document sent to the processor.
PDF_MIME_TYPE = "application/pdf"

The following code calls the batch processing API and stores the responses in the output GCS location.

In [None]:
def process_document_from_input_file(
    batch_size: int = 50, requests_per_pause: int = 4
):
    """Submit every PDF under the input prefix to the processor in batches.

    Lists the objects in ``GCS_INPUT_BUCKET`` whose names start with
    ``GCS_INPUT_PREFIX``, keeps the ones ending in ``.pdf`` (case-insensitive)
    and sends them to the Document AI batch API in groups of ``batch_size``.
    Results are written by the service under
    ``{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/``.

    The function does not wait for the long-running operations to finish; it
    prints each operation name so progress can be tracked elsewhere.

    Args:
        batch_size: Maximum number of documents per batch request.
        requests_per_pause: Number of requests to submit before sleeping for
            ``TIMEOUT`` seconds (presumably to stay within the concurrent
            batch request quota -- confirm against your project's limits).

    Raises:
        ValueError: If ``batch_size`` or ``requests_per_pause`` is below 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if requests_per_pause < 1:
        raise ValueError("requests_per_pause must be at least 1")

    destination_uri = f"{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/"

    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{LOCATION}-documentai.googleapis.com"
        )
    )
    name = client.processor_path(
        project=PROJECT_ID, location=LOCATION, processor=PROCESSOR_ID
    )

    storage_client = storage.Client()
    bucket = storage_client.bucket(GCS_INPUT_BUCKET)

    def _batch_process(
        name: str,
        input_config: documentai.BatchDocumentsInputConfig,
        destination_uri: str,
    ):
        """Start one batch process operation and return its handle."""
        output_config = documentai.DocumentOutputConfig(
            gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
                gcs_uri=destination_uri
            )
        )
        request = documentai.BatchProcessRequest(
            name=name,
            input_documents=input_config,
            document_output_config=output_config,
        )
        print(input_config)
        return client.batch_process_documents(request)

    print("Input Files:")
    documents: List[documentai.GcsDocument] = []
    for blob in bucket.list_blobs(prefix=GCS_INPUT_PREFIX):
        # Match on the extension only, so names that merely contain ".pdf"
        # (for example "scan.pdf.json") are not submitted as PDFs.
        if not blob.name.lower().endswith(".pdf"):
            continue
        source = f"gs://{GCS_INPUT_BUCKET}/{blob.name}"
        print(source)
        documents.append(
            documentai.GcsDocument(gcs_uri=source, mime_type=PDF_MIME_TYPE)
        )

    # Chunk by number of PDFs (not by number of listed blobs) so every request
    # carries at most `batch_size` documents and none are left unsubmitted.
    for request_index, start in enumerate(range(0, len(documents), batch_size)):
        # Pause before every group of `requests_per_pause` requests except the
        # first one; this also covers the final, possibly partial, batch.
        if request_index and request_index % requests_per_pause == 0:
            time.sleep(TIMEOUT)

        chunk = documents[start : start + batch_size]
        input_config = documentai.BatchDocumentsInputConfig(
            gcs_documents=documentai.GcsDocuments(documents=chunk)
        )
        operation = _batch_process(name, input_config, destination_uri)

        print("process called")
        print(f"Started operation: {operation.operation.name}")
        # Running total of documents submitted so far.
        print(start + len(chunk))

### Step 1: Call Form Parser to batch process the input documents

In [None]:
# Submit the batch process requests for the PDFs under the input prefix.
# The call returns without waiting for the operations to complete.
process_document_from_input_file()

In [None]:
def format_field_name(name, demiliter="_"):
    """Map a Form Parser field label to its short entity type name.

    The label is trimmed, upper-cased, stripped of commas and apostrophes,
    split on whitespace and re-joined with ``demiliter`` before being looked
    up in the table of known W-2 labels.

    Args:
        name: Raw field label text.
        demiliter: String used to join the label's words (the lookup table is
            keyed with "_", so other values never match).

    Returns:
        The entity type name, or None when the label is not in the table.
    """
    known_labels = {
        "A_EMPLOYEES_SOCIAL_SECURITY_NUMBER": "EMPL_SSN",
        "B_EMPLOYER_IDENTIFICATION_NUMBER": "EMPLR_ID_NUMBER",
        "C_EMPLOYERS_NAME_ADDRESS_AND_ZIP_CODE": "EMPLR_NAME_ADDRESS",
        "D_CONTROL_NUMBER": "CONTROL_NUMBER",
        "1_WAGES_TIPS_OTHER_COMPENSATION": "WAGES_TIPS_OTHER_COMP",
        "2_FEDERAL_INCOME_TAX_WITHHELD": "FEDERAL_INCOME_TAX_WH",
        "3_SOCIAL_SECURITY_WAGES": "SS_WAGES",
        "4_SOCIAL_SECURITY_TAX_WITHHELD": "SS_TAX_WH",
    }
    # Translation table that deletes the punctuation ignored in labels.
    drop_punctuation = str.maketrans("", "", ",'")

    words = name.strip().upper().translate(drop_punctuation).split()
    return known_labels.get(demiliter.join(words))


def create_entity(
    form_field_name, form_field_value, form_textSegments, form_boundingPoly
):
    """Build an entity dictionary for one form field.

    Args:
        form_field_name: Raw label text, resolved through ``format_field_name``.
        form_field_value: Text of the field value; stored as both the mention
            text and the text anchor content.
        form_textSegments: Iterable of objects exposing ``start_index`` and
            ``end_index``.
        form_boundingPoly: Object exposing ``normalized_vertices``, each with
            ``x`` and ``y``.

    Returns:
        The entity dictionary, or None when the label has no known entity type.
    """
    entity_type = format_field_name(form_field_name)
    if not entity_type:
        return None

    vertices = [
        {"x": point.x, "y": point.y}
        for point in form_boundingPoly.normalized_vertices
    ]
    segments = [
        {"endIndex": part.end_index, "startIndex": part.start_index}
        for part in form_textSegments
    ]

    return {
        "mentionText": form_field_value,
        "type": entity_type,
        "pageAnchor": {
            "pageRefs": [{"boundingPoly": {"normalizedVertices": vertices}}]
        },
        "textAnchor": {
            "content": form_field_value,
            "textSegments": segments,
        },
    }


def entity_from_formfield(form_field):
    """Convert one form field into an entity dictionary via ``create_entity``.

    The label text comes from the field name's text anchor; the value text,
    text segments and bounding polygon all come from the field value.

    Returns:
        Whatever ``create_entity`` returns: an entity dictionary or None.
    """
    value = form_field.field_value
    value_anchor = value.text_anchor
    return create_entity(
        form_field.field_name.text_anchor.content,
        value_anchor.content,
        value_anchor.text_segments,
        value.bounding_poly,
    )


def generate_entities_from_form_fields(document):
    """Collect entities for every recognised form field in ``document``.

    Walks the form fields of each page in order, converts each with
    ``entity_from_formfield`` and keeps the non-empty results. The collected
    value is printed before it is returned.

    Returns:
        A list of entity dictionaries, or None when no field produced one.
    """
    candidates = (
        entity_from_formfield(field)
        for page in document.pages
        for field in page.form_fields
    )
    kept = [candidate for candidate in candidates if candidate]

    # Callers receive None rather than an empty list when nothing matched.
    entities = kept or None
    print(entities)
    return entities


def parse_sample_files_in_gcsbucket_mod():
    """Turn Form Parser batch output into annotated documents.

    Lists the objects under ``{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/``. For
    every object named ``<name>-<shard>.json`` it loads the document, derives
    entities from its form fields, stores them under the ``"entities"`` key of
    the document JSON and uploads the result with ``create_json`` using
    ``<name>`` as the file name.

    Objects that do not follow the ``<name>-<shard>.json`` pattern are skipped.

    Raises:
        ValueError: If the combined output URI is not of the form
            ``gs://<bucket>/<prefix>``.
    """
    destination_uri = f"{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/"
    # Results are written to GCS. Use a regex to split the URI into the
    # bucket name and the object prefix.
    uri_match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if uri_match is None:
        raise ValueError(
            f"Output URI {destination_uri!r} must look like gs://<bucket>/<prefix>"
        )
    output_bucket = uri_match.group(1)
    prefix = uri_match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))

    for i, blob in enumerate(blob_list):
        # Accept any number of shard digits and require a literal ".json"
        # suffix; anything else under the prefix is not processor output.
        file_match = re.match(r"(.+)-(\d+)\.json$", blob.name.split("/")[-1])
        if file_match is None:
            continue

        # NOTE: the shard index is not part of the output name, so several
        # shards of one input document are uploaded to the same object and
        # the last one processed wins.
        output_file_name = file_match.group(1)
        print(output_file_name)
        blob_as_bytes = blob.download_as_bytes()
        print("downloaded")

        # Tolerate fields added by newer processor versions that the installed
        # client library does not know about.
        document = documentai.types.Document.from_json(
            blob_as_bytes, ignore_unknown_fields=True
        )
        document_json = json.loads(blob_as_bytes)
        print(f"Fetched file {i + 1}")
        entities = generate_entities_from_form_fields(document)
        document_json["entities"] = entities

        create_json(document_json, output_file_name)


def create_json(json_object, filename):
    """Upload ``json_object`` as a JSON file to the annotation bucket.

    The object is written to ``GCS_OUTPUT_ANNOTATION_BUCKET`` under the name
    ``GCS_OUTPUT_ANNOTATION_URI_PREFIX + filename + ".json"``. The prefix is
    concatenated verbatim, so no separator is inserted.

    Args:
        json_object: JSON-serialisable object to upload.
        filename: Object name without the prefix or the ".json" extension.

    Returns:
        A dictionary ``{"response": "<filename> upload complete"}``.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(GCS_OUTPUT_ANNOTATION_BUCKET)
    # Create a handle for the destination object.
    blob = bucket.blob(GCS_OUTPUT_ANNOTATION_URI_PREFIX + filename + ".json")
    # Upload the serialised document.
    blob.upload_from_string(
        data=json.dumps(json_object), content_type="application/json"
    )
    result = filename + " upload complete"
    return {"response": result}

### Step 2: Read the output JSON from Form Parser to generate a `Document` for Workbench.

---



In [None]:
# Convert the Form Parser output JSON files into annotated documents and
# upload them. Step 1 does not wait for its batch operations, so make sure
# they have completed before running this cell.
parse_sample_files_in_gcsbucket_mod()

W2_XL_input_clean_2950
downloaded
Fetched file 2
[{'mentionText': '66584.46\n', 'type': 'SS_WAGES', 'pageAnchor': {'pageRefs': [{'boundingPoly': {'normalizedVertices': [{'x': 0.5773606300354004, 'y': 0.5920879244804382}, {'x': 0.6581342220306396, 'y': 0.5920879244804382}, {'x': 0.6581342220306396, 'y': 0.6026373505592346}, {'x': 0.5773606300354004, 'y': 0.6026373505592346}]}}]}, 'textAnchor': {'content': '66584.46\n', 'textSegments': [{'endIndex': 2131, 'startIndex': 2122}]}}, {'mentionText': '56081.18\n', 'type': 'WAGES_TIPS_OTHER_COMP', 'pageAnchor': {'pageRefs': [{'boundingPoly': {'normalizedVertices': [{'x': 0.5773606300354004, 'y': 0.565274715423584}, {'x': 0.6564277410507202, 'y': 0.565274715423584}, {'x': 0.6564277410507202, 'y': 0.5758242011070251}, {'x': 0.5773606300354004, 'y': 0.5758242011070251}]}}]}, 'textAnchor': {'content': '56081.18\n', 'textSegments': [{'endIndex': 2059, 'startIndex': 2050}]}}, {'mentionText': '5093.71\n', 'type': 'SS_TAX_WH', 'pageAnchor': {'pageRefs'