# 🧪 Evaluation Notebook: Flat Folder Setup

This notebook verifies:
- OCR and text extraction
- Multilingual field extraction
- Document type inference
- Output formatting

In [57]:
# ✅ Imports (flat structure)
from extractor_utils import extract_text_from_pdf
from general_extractor import extract_general_fields
from field_based_inference import infer_document_type
from credit_extractor import extract_credit_fields
from investment_extractor import extract_investment_fields
from garnishment_extractor import extract_garnishment_fields
from personal_account_extractor import extract_personal_account_fields

In [58]:
# 📄 Load document (OCR fallback built-in)
# At this point `extract_text_from_pdf` is the version imported from
# `extractor_utils` in the imports cell; a later cell redefines it locally.
text = extract_text_from_pdf("sample_docs/doc-2.pdf")
print(text[:1500])  # Preview OCR/text output (first 1500 characters only)




In [59]:
import re
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
from langdetect import detect, DetectorFactory
import tempfile
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF, using OCR when the text layer is unusable.

    The embedded text layer is read with PyPDF2 first. If that succeeds for
    every page and the result is not blank, it is returned unchanged.
    Otherwise (blank text layer, or PyPDF2 raised part-way through) each page
    is rendered at 300 dpi and passed through Tesseract.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The extracted text as a single string. When OCR produces non-blank
        text it is returned on its own, so a partial or whitespace-only
        PyPDF2 result is never glued in front of the OCR output. If OCR
        yields nothing, whatever PyPDF2 managed to read (possibly "") is
        returned as a best effort.

    Errors from either stage are printed and swallowed; nothing is raised.
    """
    native_parts = []
    try:
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() may return None; treat that as empty text
            native_parts.append(page.extract_text() or "")
        native_text = "".join(native_parts)
        if native_text.strip():
            print("[INFO] Extracted text via PyPDF2.")
            return native_text
    except Exception as e:
        print(f"[WARN] PyPDF2 failed: {e}")

    # OCR fallback
    ocr_parts = []
    try:
        print("[INFO] Falling back to OCR...")
        # Rendered page images are written to a temp dir that is removed on exit
        with tempfile.TemporaryDirectory() as tmpdir:
            images = convert_from_path(file_path, dpi=300, output_folder=tmpdir)
            for i, img in enumerate(images):
                ocr_text = pytesseract.image_to_string(img, lang='deu+eng+fra+spa+ita')
                print(f"[DEBUG] Page {i+1} OCR length:", len(ocr_text))
                ocr_parts.append(ocr_text)
    except Exception as e:
        print(f"[ERROR] OCR failed: {e}")

    ocr_result = "".join(ocr_parts)
    if ocr_result.strip():
        # OCR restarts from page 1, so prepending partial PyPDF2 text would
        # duplicate the pages that were already read.
        return ocr_result
    # OCR gave nothing usable: fall back to whatever PyPDF2 produced.
    return "".join(native_parts) + ocr_result


In [60]:
# Extract a second sample document. This rebinds `text`, replacing the
# doc-2.pdf result loaded earlier in the notebook.
text = extract_text_from_pdf("sample_docs/doc-01.pdf")
# Preview only the first 1500 characters
print("[OCR Preview]:", text[:1500])


[INFO] Falling back to OCR...
[DEBUG] Page 1 OCR length: 1135
[OCR Preview]: Deutsche Sparkasse KTO-DE-78901-2025

MITTEILUNG ÜBER KONTOSCHLIESSSUNG

Datum: 5. März 2025 Kontonummer: DE89 3704 0044 0532 0130 00

Name: Hannah Schmict
Anschrift: Mozartstrabe 15, 80336 München, Deutschland
Kundennummer: K-78945612

Sehr geehrte Frau Schmidt,

wir bestatigen den Eingang Inres Antrags aut Schließung Inres Kontos mit der IBAN DE89 3704
0044 0532 0130 00 zum 31. Marz 2025. Der aktuelle Kontostand betragt €2.457,83.

Wichtige Hinweise zur KontoschlieBung

* Alle Dauerauftráge und Lastschritten werden nach dem 31. Marz 2025 nicht mehr ausgeführt.
- Bitte informieren Sie Zahlungsemptänger Ober die KontoschlleBung.

- Ihre Debitkarte wird zum SchiieBungstermin deaktiviert.

* Der Zugang zum Online-Banking endet am 31. Marz 2025.

* Der Kontoauszug Zum Abschiuss wird Ihnen per Post zugesandt.

Restguthaben

Inr verblelbendes Guthaben in Hohe von €2.457,83 wird auf das von Innen angegebene Konto mi

In [61]:
from langdetect import DetectorFactory, detect

# langdetect samples randomly and can return different languages for the same
# input across runs; pinning the seed makes the result reproducible.
DetectorFactory.seed = 0

lang = detect(text)
print(lang)

de


In [62]:
from pprint import pprint

# Start from the generic fields, then merge the personal-account fields into
# the same dict (later keys win on collision).
fields = extract_general_fields(text, lang)
account_fields = extract_personal_account_fields(text, lang)
fields.update(account_fields)

pprint(fields)

{'account_number': 'DE89 3704 0044 0532 0130 00',
 'closing_balance': '2457.83 €',
 'customer_id': 'K-78945612',
 'customer_name': 'Hannah Schmict',
 'document_date': '05.03.2025',
 'document_id': 'KTO-DE-78901-2025',
 'document_type': 'unknown',
 'institution_address': 'Mozartstrabe 15, 80336 München, Deutsch1and',
 'institution_name': 'Sparkasse',
 'language': 'de',
 'transaction_number': 0}


In [63]:
# Language-mismatch probe: run the Italian patterns against the German document.
# The result is kept under its own name so that `fields` (the German extraction
# built above) is not overwritten before document-type inference uses it.
fields_it = extract_general_fields(text, "it")
print(fields_it["customer_id"])

None


In [64]:
# 🔍 Infer document type
# Inputs are the extracted `fields` dict and the raw document `text`.
# NOTE(review): `fields` is whatever the most recently executed cell assigned
# to that name — confirm it holds the extraction you intend to classify.
doc_type = infer_document_type(fields, text)
print("Inferred document type:", doc_type)

[DEBUG] Keyword-based fallback guess: credit
Inferred document type: credit


# 🚀 Integration Plan (Production Readiness)

### ✅ 1. Wrap `document_extractor.py` as a REST API
Use **FastAPI** to expose the pipeline as a service:

```python
from fastapi import FastAPI, File, UploadFile
from document_extractor import process_document
import tempfile

app = FastAPI()

@app.post("/extract/")
async def extract(file: UploadFile = File(...)):
    """Run the extraction pipeline on an uploaded PDF and return its result.

    The upload is spooled to a temporary ``.pdf`` file because
    ``process_document`` takes a file path. The temporary file is always
    removed afterwards, even if processing raises.
    """
    import os  # local import keeps this snippet self-contained

    content = await file.read()
    # Close the file before processing so the bytes are flushed to disk and
    # the path can be reopened by name (required on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(content)
        tmp_path = tmp.name
    try:
        return process_document(tmp_path)
    finally:
        # delete=False means cleanup is our responsibility
        os.remove(tmp_path)
```

Run with (assuming the code above is saved as `main.py`):
```bash
uvicorn main:app --reload
```

---

### ✅ 2. Log OCR raw output for debugging
In `extract_text_from_pdf()`:
```python
# Dump the extracted text for inspection. Mode "w" truncates the file, so
# only the text of the most recently processed document is kept.
with open("ocr_debug_output.txt", "w", encoding="utf-8") as f:
    f.write(text)
```

---

### ✅ 3. Add batch-processing capability
Enable CLI processing of an entire folder:
```python
results = {}
# os.listdir() returns entries in arbitrary order; sort for reproducible runs
for pdf in sorted(os.listdir(input_folder)):
    # Compare the extension case-insensitively so files like "SCAN.PDF" are not skipped
    if pdf.lower().endswith(".pdf"):
        result = process_document(os.path.join(input_folder, pdf))
        results[pdf] = result  # keep every result, keyed by file name
```

---

### ✅ 4. Add PDF validation pre-checks
Use PyPDF2 to check for corruption or encryption:
```python
# Open the PDF and reject it up front if it is encrypted.
# NOTE(review): this only tests the encryption flag; a corrupt file would
# presumably surface as an exception from PdfReader(...) itself — confirm
# and handle that case separately.
reader = PdfReader(file_path)
if reader.is_encrypted:
    raise ValueError("Encrypted PDF not supported.")
```

---

### ✅ 5. Persist JSON to database or message queue
Example using MongoDB:
```python
from pymongo import MongoClient
# Connect to a MongoDB server at localhost:27017
client = MongoClient("mongodb://localhost:27017/")
# Use the "doc_extract" database and its "outputs" collection
db = client["doc_extract"]
# `output` is not defined in this snippet — presumably the result returned by
# process_document(); confirm before use.
db["outputs"].insert_one(output)
```

---

### ✅ 6. Add monitoring and confidence hooks
Track:
- Language detected
- Document type inferred
- Field extraction completeness
- OCR quality (via pytesseract.image_to_data)

Integrate with Prometheus, Sentry, or OpenTelemetry if needed.