# Batch OCR Processing for ZANT
Przetwarza wszystkie pliki PDF z katalogu `data/` przy użyciu Azure Document Intelligence.

In [1]:
!pip install azure-ai-documentintelligence python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import os
from pathlib import Path
import json
# from pathlib import Path
import asyncio
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient

# Load Azure credentials from a local .env file / the process environment.
load_dotenv()

endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
key = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')

# os.getenv returns None for an unset variable; slicing None would raise
# TypeError, so report the endpoint the same way the key is reported.
print(f"Endpoint: {endpoint[:50]}..." if endpoint else "Endpoint: MISSING")
# Never print the key itself, only whether it is present.
print(f"Key: {'SET' if key else 'MISSING'}")

Endpoint: https://di-zus-digital-support.cognitiveservices.a...
Key: SET


In [4]:
def get_files_by_dir(base_path: str | Path) -> dict[str, list[str]]:
    """Skanuje katalogi i zwraca slownik katalog -> lista plikow"""
    result = {}
    base_path = Path(base_path)
    
    for root, dirs, files in os.walk(base_path, followlinks=False):
        if files:
            result[root] = files[:]
            
    return result


def make_out_path(src_path: str | Path, out_dir: str | Path = "out", base_dir: str | Path = "data") -> Path:
    """Tworzy sciezke wyjsciowa zachowujac strukture"""
    src_path = Path(src_path)
    out_dir = Path(out_dir)
    base_dir = Path(base_dir)
    
    try:
        rel = src_path.relative_to(base_dir)
    except ValueError:
        rel = src_path.name
    
    out_path = out_dir / rel.with_suffix(rel.suffix + ".json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    
    return out_path

In [5]:
async def analyze_single_pdf(file_path: str) -> dict:
    """Run the ``prebuilt-read`` model on one PDF and return the analysis result.

    A fresh ``DocumentIntelligenceClient`` is created per call from the
    module-level ``endpoint`` and ``key``. Every failure (unreadable file,
    invalid credentials, service error) is reported on stdout and turned into
    a ``None`` return, so one bad document does not abort a batch.

    Args:
        file_path: Path of the PDF file to analyze.

    Returns:
        The object returned by the poller's ``result()`` on success, or
        ``None`` on any error. NOTE: the ``dict`` annotation is kept unchanged
        for compatibility; callers access ``.content`` / ``.pages`` attributes
        on the returned object.
    """
    try:
        # Read the bytes up front so the file handle is closed before the
        # (potentially long) network round trip.
        with open(file_path, 'rb') as f:
            pdf_bytes = f.read()

        # Built inside the try block: a bad endpoint/key is then reported
        # here instead of escaping as an unhandled exception.
        client = DocumentIntelligenceClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key),
        )

        async with client:
            poller = await client.begin_analyze_document(
                "prebuilt-read",
                pdf_bytes,
                content_type="application/pdf"
            )
            return await poller.result()
    except Exception as e:
        print(f"Error analyzing {file_path}: {e}")
        return None

In [6]:
async def process_one(path: str, out_dir: Path, results: dict):
    """Analyze one PDF and store its extracted text as a JSON file.

    Paths already present in ``results`` are skipped. When the analysis
    fails (``analyze_single_pdf`` returns ``None``) nothing is written and
    ``results`` is left untouched.

    Args:
        path: Path of the PDF to process.
        out_dir: Root directory for the JSON output.
        results: Shared mapping ``source path -> output path``; updated in
            place once the JSON file has been written.
    """
    if path in results:
        return

    print(f"Processing: {path}")

    result = await analyze_single_pdf(path)

    if result is None:
        return

    # `getattr(...) or default` covers both a missing attribute and an
    # attribute that is present but None (hasattr alone would let None
    # through and len(None) would raise TypeError).
    pages = getattr(result, 'pages', None) or []
    data = {
        "file_path": path,
        "content": getattr(result, 'content', None) or "",
        "page_count": len(pages),
    }

    # Persist next to the mirrored input structure under out_dir.
    out_path = make_out_path(path, out_dir)

    with open(out_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the JSON file.
        json.dump(data, f, ensure_ascii=False, indent=2)

    results[path] = str(out_path)
    print(f"Saved: {out_path}")

In [7]:
async def process_many(files_by_dir: dict, out_dir: str = "out", max_concurrency: int = 5):
    """Process every PDF listed in ``files_by_dir`` with bounded concurrency.

    Args:
        files_by_dir: Mapping ``directory path -> list of file names``.
            Only names ending in ``.pdf`` (case-insensitive) are processed.
        out_dir: Root directory for the JSON output; created if missing.
        max_concurrency: Maximum number of files processed at the same time.

    Returns:
        Dict ``source path -> output path`` for the files that were saved.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1 (a semaphore of
            0 would block every task forever).
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be >= 1")

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    results = {}
    sem = asyncio.Semaphore(max_concurrency)

    async def _wrapped(path: str):
        # The semaphore caps how many files are in flight at once.
        async with sem:
            await process_one(path, out_dir, results)

    # Collect all PDF paths; compare the extension case-insensitively so
    # files named "*.PDF" are not silently skipped.
    all_paths = [
        str(Path(dir_path) / name)
        for dir_path, filenames in files_by_dir.items()
        for name in filenames
        if name.lower().endswith('.pdf')
    ]

    print(f"Found {len(all_paths)} PDF files")

    tasks = [asyncio.create_task(_wrapped(p)) for p in all_paths]
    # return_exceptions=True keeps one failure from cancelling the batch,
    # but the exceptions come back as values and must be reported here,
    # otherwise they disappear without a trace.
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)

    for path, outcome in zip(all_paths, outcomes):
        if isinstance(outcome, BaseException):
            print(f"Failed: {path}: {outcome!r}")

    print(f"\nCompleted: {len(results)}/{len(all_paths)} files")
    return results

In [9]:
# Scan the input tree and preview the first five directories
base_dir = "data"
files_by_dir = get_files_by_dir(base_dir)

print(f"Found {len(files_by_dir)} directories")
for dir_path in list(files_by_dir)[:5]:
    files = files_by_dir[dir_path]
    print(f"  {dir_path}: {len(files)} files")

Found 111 directories
  data/wypadek 20: 4 files
  data/wypadek 66: 4 files
  data/wypadek 28: 4 files
  data/wypadek 30: 4 files
  data/wypadek 23: 4 files


In [10]:
# Uruchom przetwarzanie
results = await process_many(
    files_by_dir=files_by_dir,
    out_dir="out",
    max_concurrency=5
)

Found 445 PDF files
Processing: data/wypadek 20/wyjaśnienia poszkodowanego 20.pdf
Processing: data/wypadek 20/opinia 20.pdf
Processing: data/wypadek 20/zawiadomienie o wypadku 20.pdf
Processing: data/wypadek 20/karta wypadku 20.pdf
Processing: data/wypadek 66/Zawiadomienie o wypadku 66.pdf
Saved: out/wypadek 20/opinia 20.pdf.json
Processing: data/wypadek 66/Wyjaśnienia poszkodowanego 66.pdf
Saved: out/wypadek 20/karta wypadku 20.pdf.json
Processing: data/wypadek 66/Opinia 66.pdf
Saved: out/wypadek 20/wyjaśnienia poszkodowanego 20.pdf.json
Processing: data/wypadek 66/Karta wypadku 66.pdf
Saved: out/wypadek 20/zawiadomienie o wypadku 20.pdf.json
Processing: data/wypadek 28/zawiadomienie o wypadku 28.pdf
Saved: out/wypadek 66/Zawiadomienie o wypadku 66.pdf.json
Processing: data/wypadek 28/opinia 28.pdf
Saved: out/wypadek 66/Opinia 66.pdf.json
Processing: data/wypadek 28/wyjaśnienia poszkodowanego 28.pdf
Saved: out/wypadek 66/Karta wypadku 66.pdf.json
Processing: data/wypadek 28/karta wypa

In [None]:
# Summary: total count plus a preview of the first five outputs
print(f"\nProcessed {len(results)} files")
print("\nFirst 5 results:")
for path in list(results)[:5]:
    out_path = results[path]
    print(f"  {Path(path).name} -> {out_path}")