# Document AI Specialized Parser (Async)
This notebook shows you how to use Document AI's specialized parsers (e.g., Invoice, Receipt, and W9) asynchronously.

In [None]:
# Install necessary Python libraries
!python -m pip install -r ../requirements.txt

In [None]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage

import re
import os
import pandas as pd
import simplejson as json

## Set your processor variables 

In [None]:
# Processor to call: project, location and processor ID are combined into the
# resource name projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}.
PROJECT_ID = "YOUR_PROJECT_ID_HERE"
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = "PROCESSOR_ID"  # Create processor in Cloud Console

# Input: every PDF under gs://{GCS_INPUT_BUCKET}/{GCS_INPUT_PREFIX} is processed.
GCS_INPUT_BUCKET = 'cloud-samples-data'
GCS_INPUT_PREFIX = 'documentai/async_invoices/'

# Output: results are written to {GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/.
# GCS_OUTPUT_URI must include the gs:// scheme, because the combined value is
# used as the batch destination and is parsed with a "gs://bucket/prefix" regex.
GCS_OUTPUT_URI = 'gs://YOUR-OUTPUT-BUCKET'
GCS_OUTPUT_URI_PREFIX = 'TEST'

# Passed to operation.result(timeout=...): maximum wait, in seconds, for the
# batch operation to finish.
TIMEOUT = 300

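The following code calls the asynchronous (batch) API, waits for the operation to finish, and then reads the extracted entities (type, value, and confidence) from the JSON results written to Cloud Storage.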

In [None]:
def process_document_sample():
    """Batch-process the sample PDFs and display the entities of each result.

    Steps:
      1. Build the output location from GCS_OUTPUT_URI / GCS_OUTPUT_URI_PREFIX
         and validate it before any request is sent.
      2. List the blobs under gs://GCS_INPUT_BUCKET/GCS_INPUT_PREFIX and add
         every ``.pdf`` blob to the batch request.
      3. Submit the batch request to the processor identified by PROJECT_ID,
         LOCATION and PROCESSOR_ID, and wait for it with ``timeout=TIMEOUT``.
      4. Read every ``.json`` blob under the output prefix, parse it as a
         Document, and display a DataFrame of (Type, Value, Confidence).

    Returns:
        None. Results are reported through ``print`` and ``display``.

    Raises:
        ValueError: if the output location is not of the form
            ``gs://bucket/prefix/`` or if no PDF is found under the input
            prefix.
    """
    # Normalize and validate the destination up front, so that a misconfigured
    # bucket fails immediately instead of after the batch operation has run.
    # A bare bucket name (no scheme) is accepted and gets the gs:// prefix.
    output_root = GCS_OUTPUT_URI.rstrip("/")
    if not output_root.startswith("gs://"):
        output_root = f"gs://{output_root}"
    output_prefix = GCS_OUTPUT_URI_PREFIX.strip("/")
    destination_uri = f"{output_root}/{output_prefix}/"

    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Output location {destination_uri!r} is not of the form gs://bucket/prefix/"
        )
    output_bucket = match.group(1)
    prefix = match.group(2)

    # Instantiates a client
    client_options = {"api_endpoint": "{}-documentai.googleapis.com".format(LOCATION)}
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)
    storage_client = storage.Client()

    # Sample invoices are stored in gs://cloud-samples-data/documentai/async_invoices/
    blobs = storage_client.list_blobs(GCS_INPUT_BUCKET, prefix=GCS_INPUT_PREFIX)
    input_configs = []
    print("Input Files:")
    for blob in blobs:
        # Match on the file extension, not on ".pdf" appearing anywhere in the name.
        if blob.name.lower().endswith(".pdf"):
            source = "gs://{bucket}/{name}".format(bucket=GCS_INPUT_BUCKET, name=blob.name)
            print(source)
            input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
                gcs_source=source, mime_type="application/pdf")
            input_configs.append(input_config)

    if not input_configs:
        raise ValueError(
            f"No PDF files found under gs://{GCS_INPUT_BUCKET}/{GCS_INPUT_PREFIX}"
        )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first.
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=input_configs,
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result(timeout=TIMEOUT)

    # Results are written to GCS under the destination prefix.
    # Everything under that prefix is listed, so JSON files left there by
    # earlier runs with the same prefix are read and displayed again.
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))

    for i, blob in enumerate(blob_list):
        # Only JSON files hold processor results; anything else is reported and skipped.
        if not blob.name.lower().endswith(".json"):
            print(f"Skipping non-supported file type {blob.name}")
            continue

        # Download the contents of this blob as a bytes object.
        # NOTE(review): newer google-cloud-storage releases appear to prefer
        # download_as_bytes() - confirm against the version in requirements.txt.
        blob_as_bytes = blob.download_as_string()
        print("downloaded")

        document = documentai.types.Document.from_json(blob_as_bytes)
        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page:
        # https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document

        # Read the entities output from the processor
        types = []
        values = []
        confidence = []
        for entity in document.entities:
            types.append(entity.type_)
            values.append(entity.mention_text)
            confidence.append(round(entity.confidence, 4))

        # Create a Pandas Dataframe to print the values in tabular format.
        df = pd.DataFrame({'Type': types, 'Value': values, 'Confidence': confidence})
        display(df)


# Extract shards from the text field
def get_text(doc_element, document):
    """Return the text that ``doc_element`` points to inside ``document.text``.

    Document AI identifies form fields by their offsets in the document text.
    This function converts those offsets to text snippets.

    Args:
        doc_element: object exposing ``text_anchor.text_segments``, an iterable
            of segments that each carry ``start_index`` and ``end_index``.
        document: object exposing the full text as ``document.text``.

    Returns:
        The concatenation, in segment order, of
        ``document.text[start_index:end_index]`` for every segment; an empty
        string when there are no segments.
    """
    # If a text segment spans several lines, it will be stored in different
    # text segments, so the slices are joined back together.
    # Indices are passed through int() before slicing, as they may not
    # arrive as plain ints.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )

In [5]:
# Run the batch job. process_document_sample() has no return statement, so
# `doc` is None; the results appear through its print()/display() calls.
doc = process_document_sample()

downloaded
Fetched file 1


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 2


Unnamed: 0,Type,Value,Confidence
0,net_amount,$2000.00,1.0
1,invoice_date,01/01/1970,0.9812
2,due_date,01/01/2025,0.9591
3,total_tax_amount,140.00,0.9479
4,invoice_id,001,0.8444
5,payment_terms,6 month contract,0.7743
6,receiver_name,John Doe,0.7147
7,supplier_name,Company ABC,0.7109
8,currency,$,0.7049
9,supplier_address,"111 Main Street\nAnytown, USA",0.4738


downloaded
Fetched file 3


Unnamed: 0,Type,Value,Confidence
0,currency,INR,1.0
1,net_amount,-9782.20,1.0
2,total_amount,9782.00,0.9352
3,supplier_name,SAMRAT CARS PVT LTD,0.9306
4,invoice_date,18/03/2019,0.8867
5,invoice_id,RBR19B0129530,0.8339
6,supplier_tax_id,24AAFCA0924K1ZX,0.7537
7,supplier_phone,022 61933277,0.6741
8,supplier_website,www.aldautomotive.in,0.6605
9,total_tax_amount,136.14,0.5817


downloaded
Fetched file 4


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 5


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 6


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 7


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 8


Unnamed: 0,Type,Value,Confidence
0,net_amount,$2000.00,1.0
1,invoice_date,01/01/1970,0.9812
2,due_date,01/01/2025,0.9591
3,total_tax_amount,140.00,0.9479
4,invoice_id,001,0.8444
5,payment_terms,6 month contract,0.7743
6,receiver_name,John Doe,0.7147
7,supplier_name,Company ABC,0.7109
8,currency,$,0.7049
9,supplier_address,"111 Main Street\nAnytown, USA",0.4738


downloaded
Fetched file 9


Unnamed: 0,Type,Value,Confidence
0,currency,INR,1.0
1,net_amount,-9782.20,1.0
2,total_amount,9782.00,0.9352
3,supplier_name,SAMRAT CARS PVT LTD,0.9306
4,invoice_date,18/03/2019,0.8867
5,invoice_id,RBR19B0129530,0.8339
6,supplier_tax_id,24AAFCA0924K1ZX,0.7537
7,supplier_phone,022 61933277,0.6741
8,supplier_website,www.aldautomotive.in,0.6605
9,total_tax_amount,136.14,0.5817


downloaded
Fetched file 10


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 11


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 12


Unnamed: 0,Type,Value,Confidence
0,net_amount,$2000.00,1.0
1,invoice_date,01/01/1970,0.9812
2,due_date,01/01/2025,0.9591
3,total_tax_amount,140.00,0.9479
4,invoice_id,001,0.8444
5,payment_terms,6 month contract,0.7743
6,receiver_name,John Doe,0.7147
7,supplier_name,Company ABC,0.7109
8,currency,$,0.7049
9,supplier_address,"111 Main Street\nAnytown, USA",0.4738


downloaded
Fetched file 13


Unnamed: 0,Type,Value,Confidence
0,currency,INR,1.0
1,net_amount,-9782.20,1.0
2,total_amount,9782.00,0.9352
3,supplier_name,SAMRAT CARS PVT LTD,0.9306
4,invoice_date,18/03/2019,0.8867
5,invoice_id,RBR19B0129530,0.8339
6,supplier_tax_id,24AAFCA0924K1ZX,0.7537
7,supplier_phone,022 61933277,0.6741
8,supplier_website,www.aldautomotive.in,0.6605
9,total_tax_amount,136.14,0.5817


downloaded
Fetched file 14


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 15


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 16


Unnamed: 0,Type,Value,Confidence
0,net_amount,$2000.00,1.0
1,invoice_date,01/01/1970,0.9812
2,due_date,01/01/2025,0.9591
3,total_tax_amount,140.00,0.9479
4,invoice_id,001,0.8444
5,payment_terms,6 month contract,0.7743
6,receiver_name,John Doe,0.7147
7,supplier_name,Company ABC,0.7109
8,currency,$,0.7049
9,supplier_address,"111 Main Street\nAnytown, USA",0.4738


downloaded
Fetched file 17


Unnamed: 0,Type,Value,Confidence
0,currency,INR,1.0
1,net_amount,-9782.20,1.0
2,total_amount,9782.00,0.9352
3,supplier_name,SAMRAT CARS PVT LTD,0.9306
4,invoice_date,18/03/2019,0.8867
5,invoice_id,RBR19B0129530,0.8339
6,supplier_tax_id,24AAFCA0924K1ZX,0.7537
7,supplier_phone,022 61933277,0.6741
8,supplier_website,www.aldautomotive.in,0.6605
9,total_tax_amount,136.14,0.5817


downloaded
Fetched file 18


Unnamed: 0,Type,Value,Confidence
0,net_amount,"$22,379.39",1.0
1,due_date,"Sep 30, 2019",0.9958
2,invoice_date,"Sep 24, 2019",0.9681
3,total_amount,19647.68,0.9651
4,total_tax_amount,1767.97,0.9452
5,invoice_id,23413561D,0.8993
6,currency,$,0.6282
7,freight_amount,$199.99,0.371
8,receiver_address,"Jane Smith,\n1600 Amphitheatre Pkway\nMountain...",0.2417
9,purchase_order,12,0.0155


downloaded
Fetched file 19


Unnamed: 0,Type,Value,Confidence
0,net_amount,$2000.00,1.0
1,invoice_date,01/01/1970,0.9812
2,due_date,01/01/2025,0.9591
3,total_tax_amount,140.00,0.9479
4,invoice_id,001,0.8444
5,payment_terms,6 month contract,0.7743
6,receiver_name,John Doe,0.7147
7,supplier_name,Company ABC,0.7109
8,currency,$,0.7049
9,supplier_address,"111 Main Street\nAnytown, USA",0.4738


downloaded
Fetched file 20


Unnamed: 0,Type,Value,Confidence
0,currency,INR,1.0
1,net_amount,-9782.20,1.0
2,total_amount,9782.00,0.9352
3,supplier_name,SAMRAT CARS PVT LTD,0.9306
4,invoice_date,18/03/2019,0.8867
5,invoice_id,RBR19B0129530,0.8339
6,supplier_tax_id,24AAFCA0924K1ZX,0.7537
7,supplier_phone,022 61933277,0.6741
8,supplier_website,www.aldautomotive.in,0.6605
9,total_tax_amount,136.14,0.5817
