In [6]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

import os
import json

In [None]:
# Project root = parent of the notebook's working directory.
# os.path.dirname is used instead of splitting the path on "/": the manual
# split yields "" when the working directory is a first-level directory
# (e.g. "/content"), which silently turns the paths below into paths relative
# to the working directory, and it does not split on non-"/" separators.
root_path = os.path.dirname(os.getcwd())

# Where the Azure Form Recognizer responses and derived results are written.
azure_root_path = os.path.join(root_path, "azureOutput")

# Where the per-document folders of page images are read from.
images_root_path = os.path.join(root_path, "imagesPerPage")

In [2]:
# Read the service endpoint/key settings from the JSON file that sits next to
# this notebook; the parsed dict is kept in `creds`.
credentials_path = "./credentials.json"
with open(credentials_path, mode="r") as credentials_file:
    creds = json.load(credentials_file)

<h2>Call Azure Form Recognizer Cloud Service</h2>

In [3]:
# Build the Form Recognizer client from the endpoint and key stored in `creds`.
# SDK samples:
# https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/formrecognizer/azure-ai-formrecognizer/samples
vision_endpoint = creds["azure_vision_endpoint"]
vision_credential = AzureKeyCredential(creds["azure_vision_key"])
document_analysis_client = DocumentAnalysisClient(
    endpoint=vision_endpoint,
    credential=vision_credential,
)

In [6]:
# Send every page image to Azure Form Recognizer ("prebuilt-document" model)
# and store each raw response as JSON under
# <azure_root_path>/<folder>/annotation/.
for folder in os.listdir(images_root_path):
    folder_dir = os.path.join(images_root_path, folder)

    # Only directories hold page images. This skips ".DS_Store" as before and
    # also any other stray file, which would otherwise make
    # os.listdir(folder_dir) raise after an output directory was created.
    if not os.path.isdir(folder_dir):
        continue

    print(f"Processing Folder --> {folder}", end="\n")

    azure_dir = os.path.join(azure_root_path, folder, "annotation")
    os.makedirs(azure_dir, exist_ok=True)

    for image_file in os.listdir(folder_dir):
        image_file_dir = os.path.join(folder_dir, image_file)

        # Hidden files (e.g. a nested ".DS_Store") and sub-directories are not
        # page images and must not be uploaded to the service.
        if image_file.startswith(".") or not os.path.isfile(image_file_dir):
            continue

        # Swap only the trailing extension for ".json"; str.replace would also
        # rewrite a ".png" in the middle of the name and would leave the name
        # of a non-png image unchanged.
        result_name = os.path.splitext(image_file)[0] + ".json"
        result_save_dir = os.path.join(azure_dir, result_name)

        with open(image_file_dir, "rb") as load_file:
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-document", document=load_file
            )

        # Wait for the analysis to finish, then convert the result to a plain
        # dict so it can be serialized.
        result = poller.result()
        result_json = result.to_dict()

        with open(result_save_dir, "w") as write_file:
            json.dump(result_json, write_file)

Processing Folder --> oncoextra-tnbc-ntrk-wm-sample-report_pdf
Processing Folder --> Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEBchanged_pdf
Processing Folder --> F1CDx Sample Report (Lung) (copy)_pdf
Processing Folder --> F1CDx Sample Report (Lung) changed_pdf
Processing Folder --> CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_pdf
Processing Folder --> Sample-NGS-Thyroid-MTC-report_changed_pdf
Processing Folder --> Tempus-Onco_Clinical-Report-Sample_pdf
Processing Folder --> Positive-Report_pdf


<h2>Extract Azure Form Recognizer Response</h2>

In [29]:
def extract_kv_pairs(response):
    """Collect the key/value pairs of one analysis result.

    Args:
        response: Dict form of an analysis result. Its "key_value_pairs" entry
            is read as a list of items, each with a "key" dict and a "value"
            dict (or None); the text is taken from their "content" field.

    Returns:
        Dict mapping each key's content to the list of value contents found
        for that key, in the order the pairs are listed. A pair whose value is
        None contributes an empty string.
    """
    out = {}

    # A missing or null "key_value_pairs" entry is treated as "no pairs".
    for item in response.get("key_value_pairs") or []:
        key = item["key"]["content"]
        value = item["value"]
        val = value["content"] if value is not None else ""

        # Accumulate instead of overwriting, so a key that occurs more than
        # once keeps all of its values rather than only the last one.
        out.setdefault(key, []).append(val)

    return out

In [30]:
def extract_table(response):
    r"""Serialize every table of an analysis result into one text blob.

    Each cell is written as its content followed by "<sep>", a "\n" is written
    whenever the row index changes, and every table is terminated by
    "\n\n\nEnd of table.\n".

    Args:
        response: Dict form of an analysis result. Its "tables" entry is read
            as a list of tables, each with a "cells" list whose items carry
            "row_index" and "content". Cells are assumed to be listed row by
            row (TODO confirm against the service output).

    Returns:
        The concatenated text of all tables; "" when there are no tables.
    """
    chunks = []

    # A missing or null "tables" entry is treated as "no tables".
    for table in response.get("tables") or []:
        current_index = 0
        for cell in table["cells"]:
            row_index = cell["row_index"]
            if row_index != current_index:
                chunks.append("\n")
                # Jump straight to the cell's row. Incrementing by one would
                # stay out of sync after a skipped row index and emit a line
                # break before each following cell until it caught up.
                current_index = row_index

            # The format is line based, so a newline inside a cell would split
            # the row; flatten it to a space.
            text = cell["content"].replace("\n", " ")
            chunks.append(text + "<sep>")

        chunks.append("\n\n\nEnd of table.\n")

    return "".join(chunks)

In [31]:
def extract_json_from_csv(csv):
    r"""Turn the text produced by extract_table into a column-oriented dict.

    The first line of each table is used as the header row; every following
    row with at least as many cells as the header adds its values to the
    lists of the matching headers. Columns of all tables share one dict, so
    equal header names are merged.

    Args:
        csv: Text in which each cell is followed by "<sep>", rows are separated
            by "\n" and every table ends with "\n\n\nEnd of table.\n".

    Returns:
        Dict mapping header text to the list of values collected for it.
    """
    out = {}

    # Whatever follows the last end-of-table marker is not a table; drop it.
    tables = csv.split("\n\n\nEnd of table.\n")[:-1]

    for table in tables:
        # Keep every non-empty line. Rows are separated by "\n" without a
        # trailing newline, so slicing off the last line would discard the
        # final data row of the table.
        table_lines = [line for line in table.split("\n") if line]
        if not table_lines:
            # Table without any cells: nothing to extract.
            continue

        # Every cell is followed by "<sep>", so the last split piece is the
        # empty remainder and is dropped.
        table_cells = [line.split("<sep>")[:-1] for line in table_lines]

        headers = table_cells[0]
        header_count = len(headers)

        for row in table_cells[1:]:
            # Rows with fewer cells than the header are skipped.
            if len(row) >= header_count:
                for header, value in zip(headers, row):
                    out.setdefault(header, []).append(value)

    return out

In [35]:
def combine_table_kvs(table_json,kvs):
    """Merge key/value pairs into the table dict and return that same dict.

    Args:
        table_json: Dict mapping a name to a list of values; modified in place.
        kvs: Dict mapping a key to a list of strings.

    Returns:
        `table_json`, where every whitespace-stripped key of `kvs` has had its
        whitespace-stripped values appended (the entry is created if absent).
    """
    for raw_key, raw_values in kvs.items():
        clean_key = raw_key.strip()
        clean_values = [entry.strip() for entry in raw_values]

        # setdefault creates the empty list only for keys not seen before.
        table_json.setdefault(clean_key, []).extend(clean_values)

    return table_json

In [36]:
# Turn every saved analysis result into a table text file
# ("<name>_table_result.csv") and a combined tables + key/value JSON file
# ("<name>_json_result.json") under <azure_root_path>/<folder>/results/.
for folder in os.listdir(azure_root_path):
    annotation_dir = os.path.join(azure_root_path, folder, "annotation")

    # Skips ".DS_Store" as before, plus any other stray file and any folder
    # without saved annotations; listing a missing directory would raise.
    if not os.path.isdir(annotation_dir):
        continue

    results_dir = os.path.join(azure_root_path, folder, "results")
    os.makedirs(results_dir, exist_ok=True)

    for jsonFile in os.listdir(annotation_dir):
        payload_dir = os.path.join(annotation_dir, jsonFile)

        # Hidden files (e.g. a nested ".DS_Store") and sub-directories are not
        # analysis results and would fail to parse as JSON.
        if jsonFile.startswith(".") or not os.path.isfile(payload_dir):
            continue

        base_name = jsonFile.split(".json")[0]
        table_save_dir = os.path.join(results_dir, base_name + "_table_result.csv")
        json_table_save_dir = os.path.join(results_dir, base_name + "_json_result.json")

        with open(payload_dir, "r") as load_file:
            json_payload = json.load(load_file)

        kvs = extract_kv_pairs(json_payload)
        csv = extract_table(json_payload)
        json_table = extract_json_from_csv(csv)

        # combine_table_kvs extends json_table in place and returns it.
        master = combine_table_kvs(json_table, kvs)

        with open(table_save_dir, "w") as save_file:
            save_file.write(csv)

        with open(json_table_save_dir, "w") as json_save_file:
            json.dump(master, json_save_file)