In [None]:
import re
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult

# Azure Document Intelligence connection settings.
# The values are taken from the environment when the variables are set, so real
# secrets never need to be saved inside the notebook. The literals are the
# notebook's original placeholders and are used unchanged as the fallback.
endpoint = os.environ.get("DOCUMENTINTELLIGENCE_ENDPOINT", '------------------')
doc_intelligence_key = os.environ.get("DOCUMENTINTELLIGENCE_API_KEY", '------------------')

# Folder scanned for input PDFs, and folder that receives the extracted text files.
input_folder = "PdfFiles"
output_folder = "TextFiles"

In [None]:
# Single client instance reused by every analysis request made in this notebook.
# It authenticates with the key credential built from `doc_intelligence_key`.
document_intelligence_client = DocumentIntelligenceClient(
    credential=AzureKeyCredential(doc_intelligence_key),
    endpoint=endpoint,
    api_version="------------",
)

In [3]:
def process_and_save_text(file_path, output_path):
    """Analyze one document and save its cleaned text, page by page.

    The file at ``file_path`` is sent to the ``prebuilt-layout`` model through
    the module-level ``document_intelligence_client`` with markdown output
    requested. For each page of the result, the slices of ``result.content``
    referenced by the page's spans are collected, hyphen + line-break pairs are
    removed, all whitespace runs are collapsed to single spaces, and the text is
    written under a ``Page N`` header (N counts pages from 1 in result order).

    Args:
        file_path: Path of the document to analyze; opened in binary mode.
        output_path: Path of the text file to create (UTF-8, overwritten).

    Returns:
        None. Any exception is caught and reported with ``print`` so a batch
        run can continue with the next file; no exception propagates.
    """
    try:
        with open(file_path, "rb") as f:
            # NOTE(review): some SDK releases name the document parameter `body`
            # instead of `analyze_request` — confirm against the installed version.
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                analyze_request=f,
                content_type="application/octet-stream",
                output_content_format='markdown'
            )
            result = poller.result()

        content_per_page = []
        for page_number, page in enumerate(result.pages, start=1):
            # A page may reference more than one span of the document content;
            # use all of them instead of only the first. A page without spans
            # yields empty text rather than failing the whole document.
            raw_content = "\n".join(
                result.content[span['offset']: span['offset'] + span['length']]
                for span in (page.spans or [])
            )
            # Re-join words that were hyphenated across a line break.
            cleaned_content = re.sub(r"-\n", "", raw_content)
            # Collapse every whitespace run (including remaining line breaks)
            # into a single space; this also trims both ends.
            cleaned_content = ' '.join(cleaned_content.split())
            content_per_page.append(f"Page {page_number}\n{cleaned_content}\n")

        with open(output_path, "w", encoding="utf-8") as output_file:
            output_file.write("\n\n".join(content_per_page))
        print(f"Processed and saved: {output_path}")
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller move on.
        print(f"Error processing {file_path}: {e}")
# Ensure the destination folder exists before it is scanned or written to.
os.makedirs(output_folder, exist_ok=True)

# Base names of files already present in the output folder whose extension is
# neither .pdf nor .txt. A PDF sharing one of these base names is skipped below;
# an existing .txt output is detected separately, per file, further down.
existing_base_names = {
    os.path.splitext(entry)[0]
    for entry in os.listdir(output_folder)
    if not entry.lower().endswith(('.pdf', '.txt'))
}

for filename in os.listdir(input_folder):
    # Only PDF files (extension matched case-insensitively) are processed.
    if not filename.lower().endswith(".pdf"):
        continue

    base_name = os.path.splitext(filename)[0]
    if base_name in existing_base_names:
        print(f"Skipping '{filename}' as a matching output file exists in '{output_folder}'.")
        continue

    output_file_name = f"{base_name}.txt"
    output_file_path = os.path.join(output_folder, output_file_name)
    if os.path.exists(output_file_path):
        print(f"Skipping '{filename}' as '{output_file_name}' already exists in '{output_folder}'.")
        continue

    input_file_path = os.path.join(input_folder, filename)
    process_and_save_text(input_file_path, output_file_path)

print("All documents processed and saved.")

Processed and saved: TextFiles\mpireport2024en.txt
All documents processed and saved.
