In [1]:
from gcp_functions import *
from os.path import splitext
import os
import pandas as pd

In [2]:
# Run-level settings for this notebook.
# BANK is the input sub-folder name that the processing loop filters on.
BANK = "Truist"
# Name of the local directory (created under PATH) that receives the CSV files.
OUTPUT_FOLDER = "output_truist_documentai"
# Base directory for local output: the working directory of the kernel.
PATH = os.getcwd()

In [3]:
# Make sure the local output directory exists.
# makedirs(exist_ok=True) is idempotent: it avoids the check-then-create race
# of `exists()` + `mkdir()` and is safe to re-run on every kernel restart.
os.makedirs(os.path.join(PATH, OUTPUT_FOLDER), exist_ok=True)

In [4]:
# Print the list of processor type identifiers, one per line, each indented
# by nine spaces under the "Processor types:" heading.
print(
    "\n".join(
        ["Processor types:"]
        + [
            " " * 9 + processor_type
            for processor_type in (
                "INVOICE_PROCESSOR",
                "CUSTOM_EXTRACTION_PROCESSOR",
                "FORM_PARSER_PROCESSOR",
                "OCR_PROCESSOR",
                "FORM_W9_PROCESSOR",
                "EXPENSE_PROCESSOR",
                "US_DRIVER_LICENSE_PROCESSOR",
                "US_PASSPORT_PROCESSOR",
                "ID_PROOFING_PROCESSOR",
            )
        ]
    )
)

Processor types:
         INVOICE_PROCESSOR
         CUSTOM_EXTRACTION_PROCESSOR
         FORM_PARSER_PROCESSOR
         OCR_PROCESSOR
         FORM_W9_PROCESSOR
         EXPENSE_PROCESSOR
         US_DRIVER_LICENSE_PROCESSOR
         US_PASSPORT_PROCESSOR
         ID_PROOFING_PROCESSOR


In [5]:
# Try to create the form-parser processor; if creation fails, fall back to
# enabling the existing processor instead.
# NOTE(review): presumably the expected failure is "processor already exists";
# confirm and narrow the exception type accordingly.
# The project/location/processor literals are repeated here because the
# PROJECT_ID / LOCATION / PROCESSOR_ID constants are only defined in a later cell.
try:
    create_processor_sample(
        project_id="un-analitica-avanzada",
        location="us",
        processor_display_name="test_document_ai_form_parse",
        processor_type="FORM_PARSER_PROCESSOR",
    )
except Exception:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, so the cell can still be interrupted.
    enable_processor_sample(
        project_id="un-analitica-avanzada",
        location="us",
        processor_id="695bea31f0e5bd8d",
    )

projects/819948058680/locations/us/operations/4290722472672428289


In [6]:
# --- Document AI processor -------------------------------------------------
PROJECT_ID = "un-analitica-avanzada"
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = "695bea31f0e5bd8d"  # Create processor before running sample

# --- Cloud Storage layout --------------------------------------------------
# The same bucket holds the input documents and receives the processing output.
BUCKET_NAME = "test_documentai_process"
INPUT_FOLDER = "new_inputs_doc"
GCS_OUTPUT_BUCKET = "gs://" + BUCKET_NAME
GCS_OUTPUT_URI_PREFIX = "new_output_documentai_process"

# --- Input format ----------------------------------------------------------
# Supported file types: https://cloud.google.com/document-ai/docs/file-types
MIME_TYPE = "application/pdf"

In [7]:
# Names of the sub-folders found under INPUT_FOLDER in the bucket; the
# processing loop picks the one that matches BANK.
subfolders = get_subfolders_names(
    bucket_name=BUCKET_NAME,
    folder_name=INPUT_FOLDER,
)

In [8]:
# For every file in the BANK sub-folder: run it through the Document AI
# processor, turn each detected table into a DataFrame and save it as a CSV
# under <PATH>/<OUTPUT_FOLDER>/file_<idx>/.
for subfolder in subfolders:

    # Only the sub-folder of the configured bank is processed.
    if subfolder != BANK:
        continue

    file_names = get_files_names(
        bucket_name=BUCKET_NAME,
        master_folder=INPUT_FOLDER,
        folder_name=subfolder,
    )

    for idx, file_name in enumerate(file_names):
        # The inputs live in BUCKET_NAME, so the URI is built from it rather
        # than from the output-bucket constant (which only worked because
        # both currently point at the same bucket).
        gcs_input_uri = f"gs://{BUCKET_NAME}/{INPUT_FOLDER}/{subfolder}/{file_name}"

        list_document = batch_process_documents(
            project_id=PROJECT_ID,
            location=LOCATION,
            processor_id=PROCESSOR_ID,
            gcs_input_uri=gcs_input_uri,
            input_mime_type=MIME_TYPE,
            gcs_output_bucket=GCS_OUTPUT_BUCKET,
            gcs_output_uri_prefix=GCS_OUTPUT_URI_PREFIX,
        )

        # Input filename without its extension. splitext only strips the
        # final extension, whereas split(".pdf")[0] cut the name at the first
        # ".pdf" occurring anywhere in it.
        output_file_prefix = splitext(file_name)[0]

        # One local output directory per input file; loop-invariant, so it is
        # computed once here instead of once per table.
        folder = "file_" + str(idx)
        output_dir = os.path.join(PATH, OUTPUT_FOLDER, folder)

        for document in list_document:
            print(output_file_prefix)

            # Tables of the current document keyed by "page_<n>_table_<i>";
            # kept in the notebook namespace for interactive inspection.
            dic_tables = {}

            for page in document.pages:
                for index, table in enumerate(page.tables):
                    header_row_values = get_table_data(table.header_rows, document.text)
                    body_row_values = get_table_data(table.body_rows, document.text)

                    # MultiIndex.from_arrays raises ValueError on an empty
                    # list, so a table without header rows falls back to the
                    # default integer column labels.
                    if len(header_row_values) > 0:
                        columns = pd.MultiIndex.from_arrays(header_row_values)
                    else:
                        columns = None

                    df = pd.DataFrame(data=body_row_values, columns=columns)

                    print(f"Page {page.page_number} - Table {index}")
                    dic_tables["page_" + str(page.page_number) + "_table_" + str(index)] = df

                    # Created lazily (only when there is a table to write) and
                    # idempotently, so re-running the cell does not fail.
                    os.makedirs(output_dir, exist_ok=True)

                    # Save each table as a CSV file
                    output_filename = f"{output_file_prefix}_pg{page.page_number}_tb{index}.csv"
                    df.to_csv(os.path.join(output_dir, output_filename), index=False)

Waiting for operation projects/819948058680/locations/us/operations/15061231872848256275 to complete...
Output files:
Fetching new_output_documentai_process/15061231872848256275/0/October, 2021 (1)-0.json
October, 2021 (1)
Page 1 - Table 0
Page 1 - Table 1
Page 1 - Table 2
Page 2 - Table 0
Waiting for operation projects/819948058680/locations/us/operations/10503702188275173891 to complete...
Output files:
Fetching new_output_documentai_process/10503702188275173891/0/October, 2021-0.json
October, 2021
Page 1 - Table 0
Page 1 - Table 1
Page 1 - Table 2
Page 2 - Table 0
Waiting for operation projects/819948058680/locations/us/operations/1277910807421158699 to complete...
Output files:
Fetching new_output_documentai_process/1277910807421158699/0/September, 2021-0.json
September, 2021
Page 1 - Table 0
Page 1 - Table 1
Page 1 - Table 2
Page 2 - Table 0


In [9]:
# Disable the processor once processing is finished.
# Uses the PROJECT_ID / LOCATION / PROCESSOR_ID constants from the
# configuration cell instead of repeating the literals, so the processor that
# gets disabled is always the one that was used for processing.
disable_processor_sample(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
)

projects/819948058680/locations/us/operations/17836418868320990590
