# 📘 03_Werkvertrag_OCR_PDF_Erzeugen.ipynb

In [2]:
# 📘 03_Werkvertrag_OCR_PDF_Erzeugen.ipynb

# Schritt 1: Benötigte Bibliotheken importieren
import pytesseract
from pdf2image import convert_from_bytes
from IPython.display import display, Markdown
import ipywidgets as widgets
from PIL import Image
from PyPDF2 import PdfMerger, PdfReader
import tempfile
import os
from tqdm.notebook import tqdm
from io import BytesIO
import shutil
import subprocess

# Step 2: build the upload widget, restricted to a single PDF file
upload = widgets.FileUpload(multiple=False, accept='.pdf')

# Render the heading first, then the upload control
display(Markdown("## 📂 Raster-PDF (z. B. Werkvertrag) hochladen:"), upload)

# Schritt 3: PDF in durchsuchbares PDF umwandeln mit Tesseract-PDF (kompakt & durchsuchbar)
def convert_to_searchable_pdf(uploaded_file, *, dpi=300, lang='deu'):
    """Run Tesseract OCR over an uploaded raster PDF and write a searchable PDF.

    Every page is rendered to a JPEG with pdf2image, handed to the
    ``tesseract`` command-line tool with the ``pdf`` output config, and the
    resulting single-page PDFs are merged in page order.

    Args:
        uploaded_file: Object whose ``value`` is a sequence of upload records.
            Only the first record is used; it must provide ``'name'`` (the
            file name) and ``'content'`` (the raw PDF data).
        dpi: Resolution used to render each page. It is also stored in the
            page image so the OCR step knows the scan resolution.
        lang: Tesseract language code passed via ``-l``.

    Returns:
        The path of the merged PDF, written to the current working directory
        as ``<original name>_durchsuchbar.pdf``, or ``None`` when
        ``uploaded_file.value`` is empty.

    Raises:
        subprocess.CalledProcessError: If ``tesseract`` exits with a non-zero
            status for any page.
    """
    uploaded_data = uploaded_file.value
    if not uploaded_data:
        return None

    file_info = uploaded_data[0]
    file_content = file_info['content']
    original_filename = file_info['name'].rsplit('.', 1)[0]
    file_name = original_filename + "_durchsuchbar.pdf"

    # The source PDF is parsed here only to learn how many pages to process.
    total_pages = len(PdfReader(BytesIO(file_content)).pages)

    with tempfile.TemporaryDirectory() as tmpdir:
        ocr_paths = []

        # Used as a context manager so the bar is closed even when a page fails.
        with tqdm(total=total_pages, desc="🔄 OCR Verarbeitung", unit="Seite") as progress:
            for page_no in range(1, total_pages + 1):
                # Render exactly one page per call, so only a single page
                # image is held at a time.
                images = convert_from_bytes(
                    file_content, dpi=dpi, first_page=page_no, last_page=page_no
                )
                image_path = os.path.join(tmpdir, f"page_{page_no}.jpg")
                # Store the resolution in the JPEG. Without it the OCR tool has
                # no way to know the scan resolution and has to guess the
                # physical page size of the generated PDF.
                images[0].save(image_path, format='JPEG', dpi=(dpi, dpi))

                output_base = os.path.join(tmpdir, f"ocr_page_{page_no}")
                subprocess.run(
                    ['tesseract', image_path, output_base, '-l', lang, 'pdf'],
                    check=True,
                )

                # tesseract appends the ".pdf" suffix to the output base itself.
                ocr_paths.append(output_base + '.pdf')
                progress.update(1)

        # Merge while the temporary directory (and the page PDFs in it) exists.
        final_path = os.path.join(os.getcwd(), file_name)
        merger = PdfMerger()
        try:
            for page_pdf in ocr_paths:
                merger.append(page_pdf)
            merger.write(final_path)
        finally:
            # Release the merger's file handles even if appending/writing fails.
            merger.close()

    return final_path

# Step 4: create the output area for messages and the button for the OCR run
output = widgets.Output()
button = widgets.Button(description="✅ OCR-PDF erzeugen")

def on_click_convert(b):
    """Handle a click on the convert button and report the outcome.

    All messages are rendered inside the module-level ``output`` widget,
    which is cleared at the start of every click. If nothing has been
    uploaded yet, a hint is printed and no conversion is attempted.

    Args:
        b: The argument supplied by the button's click callback; not used.
    """
    with output:
        output.clear_output()
        if not upload.value:
            print("Bitte zuerst eine Datei hochladen.")
            return

        display(Markdown("### ⏳ Verarbeitung gestartet..."))
        try:
            out_pdf = convert_to_searchable_pdf(upload)
        except (subprocess.CalledProcessError, OSError) as err:
            # The conversion signals real failures by raising (tesseract exits
            # non-zero, the executable is missing, a file cannot be written),
            # not by returning None, so report those here as well.
            print("Fehler beim Erzeugen der durchsuchbaren PDF.")
            print(f"Details: {err}")
            return

        if out_pdf:
            display(Markdown(f"### 🔍 Fertig! Deine durchsuchbare PDF wurde erzeugt: **{out_pdf}**"))
        else:
            print("Fehler beim Erzeugen der durchsuchbaren PDF.")

# Register the click handler, then render the button followed by the output area
button.on_click(on_click_convert)
display(button)
display(output)


## 📂 Raster-PDF (z. B. Werkvertrag) hochladen:

FileUpload(value=(), accept='.pdf', description='Upload')

BoundedIntText(value=1, description='Startseite:', min=1)

BoundedIntText(value=0, description='Endseite (0 = alle):')

Dropdown(description='Seitenformat:', options=(('Originalgröße beibehalten', 'original'), ('DIN A4', (595.2755…

Button(description='✅ OCR-PDF erzeugen', style=ButtonStyle())

Output()