refactor: Updates to Document AI Python Samples (#323)
* Updated OCR Quickstart Sample

Added Types to Request Creation
Added ClientOptions object for type safety
Simplified output code to print full text instead of paragraphs
Updated Link to Document Object v1 specification
Added mime_type as a variable (see the sketch below for the updated pattern)
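
For context, a minimal sketch of the updated request pattern (placeholder values; assumes an existing v1 processor):

    from google.api_core.client_options import ClientOptions
    from google.cloud import documentai_v1 as documentai

    # Placeholder values for illustration only
    project_id = "YOUR_PROJECT_ID"
    location = "us"  # Format is 'us' or 'eu'
    processor_id = "YOUR_PROCESSOR_ID"
    file_path = "invoice.pdf"
    mime_type = "application/pdf"  # now passed in as a variable

    # ClientOptions is a typed alternative to a raw dict of options
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    name = client.processor_path(project_id, location, processor_id)

    with open(file_path, "rb") as image:
        image_content = image.read()

    # Typed request objects instead of plain dicts
    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)

    result = client.process_document(request=request)

    # Print the full text instead of iterating over paragraphs
    print(result.document.text)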

* Updates to process_document_sample

- Same Updates as Quickstart Sample
- Moved Imports to top of quickstart file

* Updated Batch Process Example

    - Added typing
    - Use BatchProcessMetadata instead of Operation ID to get output files from GCS
    - Added MimeType specification
    - Added Alternatives for Directory Processing & Callbacks
    - Minor Changes to process_document/quickstart for unified style with batch

* Updates to OCR Response Handling Sample

- Separated Online Processing Request into function
- Added explicit typing for documentai objects
- Converted `.format()` to f-string
- Simplified `layout_to_text()` (see the sketch below)
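
A sketch of what the simplified helper can look like (the exact signature in the sample may differ slightly):

    from google.cloud import documentai_v1 as documentai

    def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
        """Converts a layout's text-anchor offsets into the text they reference."""
        # Document AI identifies text by offsets into the full document text;
        # a single layout may reference several non-contiguous segments.
        return "".join(
            text[int(segment.start_index) : int(segment.end_index)]
            for segment in layout.text_anchor.text_segments
        )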

* Updated Form Processing Sample

    - Updated to `v1` API
    - Separated processing request into function
    - Added explicit typing for Document AI Types
    - Separated `print_table_rows()` into a function for modularity (see the sketch below)
    - Fixed spelling error "Collumns"
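
A sketch of the separated table-printing helper, building on the `layout_to_text()` helper sketched earlier (names are illustrative):

    from typing import Sequence

    from google.cloud import documentai_v1 as documentai

    def print_table_rows(
        table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
    ) -> None:
        # Each row is a list of cells; each cell carries its own layout
        for table_row in table_rows:
            row_text = ""
            for cell in table_row.cells:
                cell_text = layout_to_text(cell.layout, text)
                row_text += f"{repr(cell_text.strip())} | "
            print(row_text)

Typically called once for `table.header_rows` and once for `table.body_rows`.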

* Updated Specialized Processor Sample

    - Added Extraction of Properties (Nested Entities) and Normalized Values (see the sketch below)
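
An illustrative sketch of the nested-entity and normalized-value handling (assumes `document` is the processed `documentai.Document`; the helper name is hypothetical):

    from google.cloud import documentai_v1 as documentai

    def print_entity(entity: documentai.Document.Entity) -> None:
        # Entity type and raw text as detected by the processor
        key = entity.type_
        text_value = entity.text_anchor.content or entity.mention_text
        confidence = entity.confidence
        print(f"    * {repr(key)}: {repr(text_value)} ({confidence:.0%} confident)")

        # Normalized values give a canonical form (e.g. dates, money amounts)
        if entity.normalized_value.text:
            print(f"      Normalized: {repr(entity.normalized_value.text)}")

    for entity in document.entities:
        print_entity(entity)
        # Properties are nested entities, e.g. the line items of an invoice;
        # 'prop' avoids shadowing the built-in 'property'
        for prop in entity.properties:
            print_entity(prop)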

* Updates to Splitter/Classifier Sample

- Updated to `v1` API
- Changed Page Number Printout
  - (Splitter/classifier processors now output all page numbers within a subdocument, instead of just the first and last; see the sketch below)
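
A sketch of the page-number printout, with a `page_refs_to_string()` helper along these lines (assumes `document` holds the splitter/classifier response):

    from typing import Sequence

    from google.cloud import documentai_v1 as documentai

    def page_refs_to_string(
        page_refs: Sequence[documentai.Document.PageAnchor.PageRef],
    ) -> str:
        # PageRef.page is zero-based; print one-based page numbers
        pages = [str(int(page_ref.page) + 1) for page_ref in page_refs]
        if len(pages) == 1:
            return f"page {pages[0]}"
        return f"pages {', '.join(pages)}"

    # Each entity is one detected subdocument, with one PageRef per page
    for entity in document.entities:
        print(
            f"'{entity.type_}' covers {page_refs_to_string(entity.page_anchor.page_refs)} "
            f"({entity.confidence:.1%} confident)"
        )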

* Updated Test for process_document_sample
 - Added mime_type

* Updated Document Quality Processor Sample
- Updated to `v1` API
- Moved API Call to separate function
- Updated `.format()` to f-strings
- Added Handling for Multiple Page Numbers per entity
- Reused `page_refs_to_string()` from the splitter/classifier example (see the sketch below)
- Added `mime_type` as parameter
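
Roughly how the quality-processor handling could look after these changes, reusing `page_refs_to_string()` from the splitter/classifier sketch (a sketch under the assumption that the processor reports per-page quality entities with per-defect properties):

    for entity in document.entities:
        # Overall quality score for the page(s) the entity refers to
        pages = page_refs_to_string(entity.page_anchor.page_refs)
        print(f"Quality score of {entity.confidence:.1%} for {pages}")

        # Individual defect scores (e.g. blurry, noisy, dark)
        for prop in entity.properties:
            print(f"    * {prop.type_} score of {prop.confidence:.1%}")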

* Updated Batch Processing Directory sample variable from CR comments

* Added Sample Input PDF Files & Output JSON Files

* Fixed Spelling Error in Invoice Parser Output filenames

* Addressed Code Review Comments

- Changed Copyright Year back to 2020
- Changed "property" variable to "prop" to avoid naming conflicts

* Updated Client Library Requirements versions

* Addressed Unit Test Failures

* Re-added google-api-core to requirements.txt

* Update samples/snippets/process_document_form_sample.py

Co-authored-by: Anthonios Partheniou <partheniou@google.com>

* Update samples/snippets/requirements.txt

Co-authored-by: Anthonios Partheniou <partheniou@google.com>

* Fixed "entirity" spelling error

Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com>
Co-authored-by: Anthonios Partheniou <partheniou@google.com>
3 people committed Jul 28, 2022
1 parent 35b59e6 commit bfe4ffc
Showing 58 changed files with 462,989 additions and 392 deletions.
168 changes: 93 additions & 75 deletions document_ai/snippets/batch_process_documents_sample.py
# [START documentai_batch_process_document]
import re

from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import storage

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
# input_mime_type = "application/pdf"
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/


def batch_process_documents(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    input_mime_type: str,
    gcs_output_bucket: str,
    gcs_output_uri_prefix: str,
    timeout: int = 300,
):

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    gcs_document = documentai.GcsDocument(
        gcs_uri=gcs_input_uri, mime_type=input_mime_type
    )

    # Load GCS Input URI into a List of document files
    gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
    input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)

    # NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
    #
    # gcs_input_uri = "gs://bucket/directory/"
    # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
    # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
    #

    # Cloud Storage URI for the Output Directory
    destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"

    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=destination_uri
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    # The full resource name of the processor, e.g.:
    # projects/project_id/locations/location/processor/processor_id
    # You must create new processors in the Cloud Console first
    name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Continually polls the operation until it is complete.
    # This could take some time for larger files
    # Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
    print(f"Waiting for operation {operation.operation.name} to complete...")
    operation.result(timeout=timeout)

    # NOTE: Can also use callbacks for asynchronous processing
    #
    # def my_callback(future):
    #   result = future.result()
    #
    # operation.add_done_callback(my_callback)

    # Once the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()

    print("Output files:")
    # One process per Input Document
    for process in metadata.individual_process_statuses:
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        # The Cloud Storage API requires the bucket name and URI prefix separately
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS
            if ".json" not in blob.name:
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            # Download JSON File as bytes object and convert to Document Object
            print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document

            # Read the text recognition output from the processor
            print("The document contains the following text:")
            print(document.text)


# [END documentai_batch_process_document]
document_ai/snippets/batch_process_documents_sample_bad_input_test.py
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
# The following bucket contains a .csv file, which will cause the sample to fail.
gcs_output_full_uri_with_wrong_type = "gs://documentai-beta-samples"
gcs_output_uri_prefix = "test"
BUCKET_NAME = f"document-ai-python-{uuid4()}"


def test_batch_process_documents_with_bad_input(capsys):
location=location,
processor_id=processor_id,
gcs_input_uri=gcs_input_uri,
input_mime_type=input_mime_type,
gcs_output_bucket=gcs_output_full_uri_with_wrong_type,
gcs_output_uri_prefix=gcs_output_uri_prefix,
timeout=450,
)
out, _ = capsys.readouterr()
10 changes: 6 additions & 4 deletions document_ai/snippets/batch_process_documents_sample_test.py
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
gcs_output_uri_prefix = uuid4()
BUCKET_NAME = f"document-ai-python-{uuid4()}"

def test_batch_process_documents(capsys, test_bucket):
location=location,
processor_id=processor_id,
gcs_input_uri=gcs_input_uri,
gcs_output_uri=f"gs://{test_bucket}",
input_mime_type=input_mime_type,
gcs_output_bucket=f"gs://{test_bucket}",
gcs_output_uri_prefix=gcs_output_uri_prefix,
)
out, _ = capsys.readouterr()

assert "Extracted" in out
assert "Paragraph" in out
assert "Invoice" in out
assert "operation" in out
assert "Fetching" in out
assert "text:" in out