In [7]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.protobuf.json_format import MessageToJson
from google.cloud.documentai_toolbox import document

import json
import os
import time

In [2]:
with open("./credentials.json","r") as file:
    creds = json.loads(file.read())
    
print(creds.keys())

dict_keys(['access_id', 'secret_key', 'region', 'mfa', 'role_arn', 'azure_vision_key', 'azure_vision_endpoint', 'google_document_location', 'google_document_id', 'google_project_name'])


In [None]:
root_path = "/".join(os.getcwd().split("/")[:-1])

google_root_path = os.path.join(root_path,"googleOutput")

images_root_path = os.path.join(root_path,"imagesPerPage")

<h2>Call Google Document AI Service</h2>

In [3]:
def create_client_process(creds):
    opts = ClientOptions(api_endpoint=f"{creds['google_document_location']}-documentai.googleapis.com")
    
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    
    parent = client.common_location_path(creds["google_document_id"], creds["google_document_location"])
    
    processor = client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            type_="FORM_PARSER_PROCESSOR",display_name=creds["google_project_name"]),
    )
    
    return processor,client

In [None]:
processor,client = create_client_process(creds)

In [6]:
for folder in os.listdir(images_root_path):
    print(f"Processing Folder --> {folder}", end="\n")
    
    folder_dir = os.path.join(images_root_path,folder)
    google_dir = os.path.join(google_root_path,folder,"annotation")
    
    os.makedirs(google_dir,exist_ok=True)
    
    for image_file in os.listdir(folder_dir):
        image_file_dir = os.path.join(folder_dir,image_file)
        result_save_dir = os.path.join(google_dir,image_file.replace(".png",".json"))
        
        with open(image_file_dir,"rb") as load_file:
            doc_bytes = load_file.read()
        
        raw_document = documentai.RawDocument(content=image_bytes,mime_type="image/png")
        
        request = documentai.ProcessRequest(name=processor.name, raw_document=raw_document)
        
        result = client.process_document(request=request)
        
        document = result.document
        
        serializedDocument = MessageToJson(document.__dict__["_pb"])
        
        jsonPayload = json.loads(serializedDocument)
        
        with open(result_save_dir,"w") as write_file:
            write_file.write(json.dumps(jsonPayload))

Processing Folder --> oncoextra-tnbc-ntrk-wm-sample-report_pdf
Processing Folder --> .DS_Store
Processing Folder --> Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEBchanged_pdf
Processing Folder --> F1CDx Sample Report (Lung) (copy)_pdf
Processing Folder --> F1CDx Sample Report (Lung) changed_pdf
Processing Folder --> CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_pdf
Processing Folder --> Sample-NGS-Thyroid-MTC-report_changed_pdf
Processing Folder --> Tempus-Onco_Clinical-Report-Sample_pdf
Processing Folder --> Positive-Report_pdf


<h2>Extract Google Document AI Response</h2>

In [147]:
def extract_table_kv_from_path(path,filename,save_dir):
    out = {}
    wrapped_document = document.Document.from_document_path(document_path=path)
    
    for page in wrapped_document.pages:
        for table_index, table in enumerate(page.tables):
            df = table.to_dataframe()
            
            output_filename = f"{filename}-{table_index}.csv"
            table_save_dir = os.path.join(save_dir,output_filename)
            df.to_csv(table_save_dir, index=False)
            
            headers = [i[0] for i in df.columns]
            for header in headers:
                if header != "":
                    values = [i[0] for i in df[header].values if i[0] != ""]
                    if header not in out.keys():
                        out[header] = []
                        out[header].extend(values)
                    else:
                        out[header].extend(values)
        
    for page in wrapped_document.pages:
        for field in page.form_fields:
            name = field.field_name
            value = field.field_value
            
            if name not in out.keys():
                out[name] = []
                out[name].append(value)
            else:
                out[name].append(value)
    
    return out

In [149]:
for folder in os.listdir(google_root_path):
    if folder != ".DS_Store":
        annotation_dir = os.path.join(google_root_path,folder,"annotation")
        results_dir = os.path.join(google_root_path,folder,"results")
        
        os.makedirs(results_dir,exist_ok=True)
        
        for jsonFile in os.listdir(annotation_dir):
            payload_dir = os.path.join(annotation_dir,jsonFile)
            json_table_save_dir = os.path.join(results_dir,jsonFile.split(".json")[0] + "_json_result.json")

            master = extract_table_kv_from_path(payload_dir,jsonFile,results_dir)
            
            with open(json_table_save_dir, "w") as json_save_file:
                json_save_file.write(json.dumps(master))