In [None]:
import os
import re
import base64
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import (
    Image,
    CompositeElement,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.langchain.openai import (
    ChatOpenAI,
    OpenAIEmbeddings,
)
from langchain.schema.document import Document
from logging import FileHandler, StreamHandler
import logging

from tenacity import (
    retry,
    wait_exponential,
    stop_after_attempt,
    retry_if_exception_type,
)


# Logging setup
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger() returns the same logger object every time this cell is
# executed, so handlers from a previous run are still attached. Remove them
# first; otherwise every record is emitted once per accumulated handler.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

_log_format = "%(asctime)s - %(levelname)s - %(message)s"

# Console output
ch = StreamHandler()
ch.setFormatter(logging.Formatter(_log_format))
logger.addHandler(ch)

# File output (append mode). FileHandler does not create missing parent
# directories, so make sure "logs/" exists before opening the file.
Path("logs").mkdir(parents=True, exist_ok=True)
fh = FileHandler("logs/pdf_processing.log", mode="a")
fh.setFormatter(logging.Formatter(_log_format))
logger.addHandler(fh)

# Initialize the proxy client and models
# One proxy client is created here and handed to both model wrappers below.
proxy_client = get_proxy_client("gen-ai-hub")

# Chat model used by the prompt chains in this notebook; temperature is fixed at 0.
model = ChatOpenAI(proxy_model_name="gpt-4o", proxy_client=proxy_client, temperature=0)
# Embedding model; it is not referenced anywhere else in the visible part of the notebook.
embedding_model = OpenAIEmbeddings(
    proxy_model_name="text-embedding-3-small", proxy_client=proxy_client
)

In [None]:
class CONFIG:
    """Static settings shared by the processing helpers of this notebook."""

    # Directory into which get_image_batch writes the extracted image files.
    OBJECT_STORAGE: str = "pdfs"


# Single shared settings instance used by the helper functions.
config = CONFIG()

In [33]:
def get_shortest_image_width_heigth(image: Image) -> float:
    """
    Return the length of the shorter side of an element's bounding box.

    The box is read from ``image.metadata.coordinates.points``; the first and
    the third point are treated as opposite corners of the box. Absolute
    differences are used so the result is never negative, regardless of which
    corner is listed first or in which direction the axes of the coordinate
    system run.

    Returns the smaller of the box width and height, in the units of the
    element's coordinate system.
    """
    corner_a = image.metadata.coordinates.points[0]
    corner_b = image.metadata.coordinates.points[2]
    width = abs(corner_b[0] - corner_a[0])
    height = abs(corner_b[1] - corner_a[1])
    return min(width, height)


def get_image_batch(chunks: list[CompositeElement]) -> list[dict]:
    """
    Collect every sufficiently large image contained in the given chunks.

    For each chunk, ``chunk.metadata.orig_elements`` is scanned for ``Image``
    elements. Images whose shorter bounding-box side is below 120 are skipped
    (usually icons). Every remaining image is decoded from its base64 payload
    and written to ``<config.OBJECT_STORAGE>/<image_id>.jpg``.

    Returns a list with one dict per kept image, with the keys:
    - image: base64 payload of the image (without any ``data:...,`` prefix)
    - image_id: ID of the image element
    - caption_block: prompt snippet containing the caption, or "" if none
    - caption: caption text, or None if the image has no caption
    - chunk_text: text of the chunk in which the image was found
    - chunk_id: ID of the chunk in which the image was found
    """
    image_batch = []
    storage = Path(config.OBJECT_STORAGE)
    storage.mkdir(parents=True, exist_ok=True)

    # loop through all chunks, looking for images
    for chunk in chunks:
        elements = chunk.metadata.orig_elements
        # enumerate() gives the element's real position; list.index() would
        # return the first element that merely compares equal.
        for position, element in enumerate(elements):
            if not isinstance(element, Image):
                continue
            if get_shortest_image_width_heigth(element) < 120:
                # Skip images that are too small (usually icons)
                continue

            # An empty string (not None) keeps the literal text "None" out of
            # the prompt when the image has no caption.
            caption_block = ""
            caption = None
            # check if the next element is a caption
            if position + 1 < len(elements):
                next_element = elements[position + 1]
                if "Caption" in str(type(next_element)):
                    caption = next_element.text
                    caption_block = f"Here is the image caption: {caption}\n\n"

            # to_dict() is called before the element ID is read; this order is
            # kept from the original implementation.
            element_dict = element.to_dict()
            image_id = element._element_id

            raw_payload = element_dict["metadata"].get("image_base64")
            if not raw_payload:
                logger.warning(f"Image {image_id} has no base64 payload, skipping")
                continue

            # Drop a possible "data:<mime>;base64," prefix before decoding.
            img_data = raw_payload.split(",", 1)[-1]
            out_path = storage / f"{image_id}.jpg"
            out_path.write_bytes(base64.b64decode(img_data))

            image_batch.append(
                {
                    "image": img_data,
                    "image_id": image_id,
                    "caption_block": caption_block,
                    "caption": caption,
                    "chunk_text": chunk.text,
                    "chunk_id": chunk._element_id,
                    # TODO add guideline name
                    # "guideline_name": chunk.metadata.guideline_name,
                }
            )
    return image_batch


def get_image_description(image_batch: list[dict], batch_size: int = 100) -> list[dict]:
    """
    Generate a textual description for each image in the batch.

    Input is a list of dictionaries with at least the following keys:
    - image: base64 string of the image
    - chunk_text: text of the chunk in which the image was found

    Optional keys:
    - caption_block: prompt snippet with the image caption ("" or None if none)
    - guideline_name: name of the guideline; when present, it is mentioned in
      the prompt, otherwise that sentence is left out

    The images are sent to the model in slices of ``batch_size`` with at most
    3 concurrent requests. A failing slice is retried up to 5 attempts with
    exponential back-off; if it still fails, a warning is logged and the
    description of every image in that slice is set to None.

    The dictionaries are modified in place (key ``description`` is added) and
    the same list is returned.

    Raises ValueError if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chat_prompt = ChatPromptTemplate.from_messages(
        [
            # System message
            (
                "system",
                "Du bist ein medizinischer Experte, der medizinische Leitlinien analysiert. "
                "Nutze dein Wissen, um die Bilder zu beschreiben, die du siehst. "
                "Sei spezifisch in Bezug auf Grafiken, wie z.B. Workflows und Entscheidungsbäume, wenn die Bilder solche zeigen.",
            ),
            # User message with text and image
            (
                "user",
                [
                    {
                        "type": "text",
                        "text": """Hier ist der Text des Dokuments, in dem das Bild gefunden wurde:

    {chunk_text}

    {caption_block}Beschreibe die Grafik im Detail. {guideline_block}Sei spezifisch in Bezug auf Grafiken, wie z.B. Workflows, wenn du welche siehst. Beginne deine Nachricht mit 'Die Grafik zeigt '""",
                    },
                    {
                        "type": "image_url",
                        # the base64 string is substituted for {image}
                        "image_url": {"url": "data:image/jpeg;base64,{image}"},
                    },
                ],
            ),
        ]
    )

    chain = chat_prompt | model | StrOutputParser()

    def _prompt_input(item: dict) -> dict:
        # Build exactly the variables the template needs. Missing or None
        # values become empty strings so the prompt never contains the literal
        # "None" and never fails on an absent template variable.
        guideline_name = item.get("guideline_name")
        guideline_block = (
            f"Das Bild ist Teil der medizinischen Leitlinie {guideline_name}. "
            if guideline_name
            else ""
        )
        return {
            "chunk_text": item.get("chunk_text") or "",
            "caption_block": item.get("caption_block") or "",
            "guideline_block": guideline_block,
            "image": item["image"],
        }

    @retry(
        retry=retry_if_exception_type(Exception),
        wait=wait_exponential(multiplier=1.3, min=5, max=60),
        stop=stop_after_attempt(5),
        # re-raise the last real exception instead of tenacity's RetryError,
        # so the warning below shows the actual cause
        reraise=True,
    )
    def _describe(sub_batch):
        return chain.batch(sub_batch, {"max_concurrency": 3})

    for i in range(0, len(image_batch), batch_size):
        sub = image_batch[i : i + batch_size]
        try:
            descriptions = _describe([_prompt_input(item) for item in sub])
        except Exception as e:
            logger.warning(
                f"Image description batch {i}-{i + len(sub) - 1} failed: {e}"
            )
            descriptions = [None] * len(sub)
        # attach whatever succeeded (or None) back onto each dict
        for item, desc in zip(sub, descriptions):
            item["description"] = desc

    return image_batch


def save_texts_to_db(chunks: list[CompositeElement], batch_size: int) -> None:
    """
    Save the text chunks to the database.

    For every chunk one Document is built:
    - page_content: chunk text rebuilt by get_chunk_text_with_links
    - metadata.id: ID of the chunk
    - metadata.page: page number of the chunk
    - metadata.type: always "text"
    - metadata.images: IDs of the chunk's images whose shorter side is at
      least 120 (smaller ones are treated as icons and left out)
    - metadata.chapter: chapter name of the chunk, or None if not set

    The documents are passed to ``add_batch`` in slices of ``batch_size``;
    any exception raised by ``add_batch`` propagates to the caller.
    NOTE(review): ``add_batch`` is not defined in the visible part of this
    notebook and must exist before this function is called.

    Raises ValueError if ``batch_size`` is smaller than 1 (a negative value
    would otherwise silently save nothing).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    docs = []
    for chunk in chunks:
        chunk_image_ids = []
        for el in chunk.metadata.orig_elements:
            if not isinstance(el, Image):
                continue
            # Filter out images that are too small (usually icons)
            if get_shortest_image_width_heigth(el) < 120:
                continue
            el.to_dict()  # This is needed to get the element ID
            chunk_image_ids.append(el._element_id)

        docs.append(
            Document(
                page_content=get_chunk_text_with_links(chunk),
                metadata={
                    "id": chunk._element_id,
                    "page": chunk.metadata.page_number,
                    "type": "text",
                    "images": chunk_image_ids,
                    "chapter": chunk.metadata.to_dict().get("chapter", None),
                },
            )
        )

    for start in range(0, len(docs), batch_size):
        add_batch(docs[start : start + batch_size])


def save_images_to_db(image_summaries: list[dict], batch_size: int) -> None:
    """
    Save the image descriptions to the database.

    Input is a list of dictionaries with at least the following keys:
    - image_id: ID of the image
    - caption: caption of the image (if any)
    - description: textual description of the image
    - chunk_id: ID of the chunk in which the image was found

    Entries without a description (for example when the description request
    failed and the value is None) are skipped with a warning, because a
    Document needs text content. The remaining documents are passed to
    ``add_batch`` in slices of ``batch_size``; any exception raised by
    ``add_batch`` propagates to the caller.
    NOTE(review): ``add_batch`` is not defined in the visible part of this
    notebook and must exist before this function is called.

    Raises ValueError if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    images = []
    for image_summary in image_summaries:
        description = image_summary.get("description")
        if not description:
            logger.warning(
                f"Image {image_summary.get('image_id')} has no description, skipping"
            )
            continue
        images.append(
            Document(
                page_content=description,
                metadata={
                    "id": image_summary.get("image_id"),
                    "chunk_id": image_summary.get("chunk_id"),
                    "type": "image",
                    "caption": image_summary.get("caption"),
                },
            )
        )

    for start in range(0, len(images), batch_size):
        add_batch(images[start : start + batch_size])


def insert_links_as_md(text: str, links: list[dict]) -> str:
    """
    Inject markdown links into `text`, matching each link["text"]
    against `text` while ignoring ALL whitespace.

    Each link is a dict with the keys "text" and "url". Links whose URL does
    not start with "http" (internal PDF links) and links with empty text are
    skipped. Links are handled in list order; each one is attached to the
    first occurrence of its text that has not already been turned into a
    link, so repeated link texts are assigned to successive occurrences.

    Returns the text with every matched span rewritten as ``[span](url)``.
    `text` is returned unchanged when `text` or `links` is empty or None.
    Problems with a single link are logged as warnings and do not abort the
    remaining links.
    """
    if not text or not links:
        return text

    # build a "normalized" version of text (no whitespace) + a map back to
    # the character positions in the (progressively rewritten) result
    norm_chars = []
    index_map = []  # index_map[i_norm] = index in `result`
    for i, char in enumerate(text):
        if not char.isspace():
            norm_chars.append(char)
            index_map.append(i)
    norm_text = "".join(norm_chars)
    # marks normalized characters that already belong to an inserted link
    consumed = [False] * len(norm_text)

    result = text
    for link in links:
        link_text = None
        try:
            link_text = link["text"]
            url = link["url"]
            if not url.startswith("http"):
                # Skip links that are not HTTP(S) --> internal PDF links
                continue

            # collapse whitespace in link_text
            needle = re.sub(r"\s+", "", link_text)
            if not needle:
                # An empty needle would match at position 0 and wrap the
                # whole text in a link.
                continue

            # first occurrence that does not overlap an earlier link
            start_norm = norm_text.find(needle)
            while start_norm != -1 and any(
                consumed[start_norm : start_norm + len(needle)]
            ):
                start_norm = norm_text.find(needle, start_norm + 1)
            if start_norm == -1:
                logger.warning(f"Could not find link: {link_text} in text")
                continue
            end_norm = start_norm + len(needle)

            # map normalized indices back to positions in `result`
            start_orig = index_map[start_norm]
            end_orig = index_map[end_norm - 1] + 1

            # extract the real snippet, build markdown, replace in result
            snippet = result[start_orig:end_orig]
            replacement = f"[{snippet}]({url})"
            result = result[:start_orig] + replacement + result[end_orig:]

            # Everything behind the replaced span moved right by the added
            # markup; shift those positions so later links stay aligned.
            shift = len(replacement) - len(snippet)
            index_map[end_norm:] = [pos + shift for pos in index_map[end_norm:]]
            consumed[start_norm:end_norm] = [True] * (end_norm - start_norm)
        except Exception as e:
            logger.warning(f"Error inserting link: {link_text}, {e}")
            continue

    return result


def get_chunk_text_with_links(chunk) -> str:
    """
    Reconstruct chunk text from chunk.metadata.orig_elements,
    replacing any detected links with Markdown [text](url).

    The text of every original element is processed separately and the
    results are joined with single spaces. Elements without text or without
    link metadata contribute their plain text (or an empty string).
    """
    parts = []
    for el in chunk.metadata.orig_elements:
        # grab the plain text; treat a missing or None text as empty
        text = getattr(el, "text", "") or ""
        # The links attribute may be absent or None when the element has no
        # links; normalize both cases to an empty list.
        links = getattr(el.metadata, "links", None) or []
        # if this element has link metadata, apply inline replacements
        parts.append(insert_links_as_md(text, links))
    return " ".join(parts)


def process_file(input_pdf: str):
    """
    Partition a PDF into title-based chunks.

    The file is parsed with the "hi_res" strategy for German text. Image and
    table blocks are extracted as base64 payloads into the element metadata,
    and the elements are grouped into chunks by title.

    Returns the list of chunks produced by ``partition_pdf``.
    """
    file_name = Path(input_pdf).name
    logger.info(f"[{file_name}] chunking…")
    chunks = partition_pdf(
        filename=input_pdf,
        # Table structure inference is enabled; the notebook reads
        # metadata.text_as_html of Table elements further down.
        infer_table_structure=True,
        strategy="hi_res",
        languages=["deu"],
        # 'Table' is listed so that an image of each table is extracted too
        extract_image_block_types=[
            "Image",
            "Table",
        ],
        extract_image_block_to_payload=True,  # if true, will extract base64 for API usage
        chunking_strategy="by_title",
        max_characters=3200,
        combine_text_under_n_chars=1000,
        new_after_n_chars=1800,
    )
    logger.info(f"[{file_name}] Found {len(chunks)} chunks.")

    return chunks




In [None]:

    # NOTE(review): orphaned fragment. These lines are indented and end in a
    # `return`, but no enclosing `def` exists in this cell, so the cell cannot
    # be executed on its own. It looks like a detached piece of the image
    # post-processing of process_file; confirm and either remove it or move it
    # into a function.
    image_batch = get_image_batch(chunks)
    if image_batch:
        # Get the image summaries
        logger.info("Getting image descriptions")
        image_descriptions = get_image_description(
            image_batch, batch_size=DB_BATCH_SIZE
        )

        # Save the image summaries to the database
        logger.info(f"Saving {len(image_batch)} image descriptions to the database")
        save_images_to_db(image_descriptions, batch_size=DB_BATCH_SIZE)

    return input_pdf, chunks




In [32]:
# Batch size used for the description requests and the database writes.
DB_BATCH_SIZE = 100
# NOTE(review): SPLIT_PDF is not read anywhere in the visible part of the notebook.
SPLIT_PDF = True
# PDF files to process.
INPUT_PDFS = ["pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf"]

# Partition every input file. `chunks` is overwritten on each iteration, so
# after the loop it holds the chunks of the last file only.
for pdf_file in INPUT_PDFS:
    logger.info(f"Processing file: {pdf_file}")
    chunks = process_file(pdf_file)

    # The image description and database steps below are currently disabled.
    # # os.remove(chapter_pdf)

    # # Get the image batch
    # logger.info("Getting image batch")
    # image_batch = get_image_batch(chunks)

    # if image_batch:
    #     # Get the image summaries
    #     logger.info("Getting image descriptions")
    #     image_descriptions = get_image_description(
    #         image_batch, batch_size=DB_BATCH_SIZE
    #     )

    #     # Save the image summaries to the database
    #     logger.info(f"Saving {len(image_batch)} image descriptions to the database")
    #     save_images_to_db(image_descriptions, batch_size=DB_BATCH_SIZE)

    # # Save the text chunks to the database
    # logger.info("Saving text chunks to the database")
    # save_texts_to_db(chunks, batch_size=DB_BATCH_SIZE)
    # logger.info(f"Saved {len(chunks)} chunks to the database")

# Show the resulting chunk list as the cell output.
chunks

2025-05-14 10:00:35,561 - INFO - Processing file: pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf
2025-05-14 10:00:35,561 - INFO - Processing file: pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf
2025-05-14 10:00:35,561 - INFO - Processing file: pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf
2025-05-14 10:00:35,561 - INFO - Processing file: pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf
2025-05-14 10:00:35,561 - INFO - Processing file: pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf
2025-05-14 10:00:35,561 - INFO - Processing file: pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf
2025-05-14 10:00:35,564 - INFO - [001_044l_S1P

[<unstructured.documents.elements.CompositeElement at 0x37365f310>,
 <unstructured.documents.elements.CompositeElement at 0x37360d860>,
 <unstructured.documents.elements.CompositeElement at 0x37360e970>,
 <unstructured.documents.elements.CompositeElement at 0x37360f930>,
 <unstructured.documents.elements.CompositeElement at 0x35eaaecf0>,
 <unstructured.documents.elements.CompositeElement at 0x37360df60>,
 <unstructured.documents.elements.CompositeElement at 0x37360c050>,
 <unstructured.documents.elements.CompositeElement at 0x37360c520>,
 <unstructured.documents.elements.CompositeElement at 0x37360c830>,
 <unstructured.documents.elements.CompositeElement at 0x37360d2b0>,
 <unstructured.documents.elements.CompositeElement at 0x37360c1a0>,
 <unstructured.documents.elements.CompositeElement at 0x37360ecf0>,
 <unstructured.documents.elements.CompositeElement at 0x37360d320>,
 <unstructured.documents.elements.CompositeElement at 0x37360fc40>,
 <unstructured.documents.elements.CompositeEleme

In [34]:
# Show every original element of the chunk at index 10 as a dict for inspection.
for el in chunks[10].metadata.orig_elements:
    display(el.to_dict())

{'type': 'Title',
 'element_id': '336411ec-b7c8-4d23-9f71-cd3ee9082593',
 'text': 'Abbildung 2. Kardiale Symptome bei LAST.',
 'metadata': {'coordinates': {'points': ((196.83333333333331,
     1107.1999999999998),
    (196.83333333333331, 1140.5333333333333),
    (639.8666666666666, 1140.5333333333333),
    (639.8666666666666, 1107.1999999999998)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 13}}

{'type': 'NarrativeText',
 'element_id': '9b3175f5-15db-435d-9f40-e2e68bea09bb',
 'text': 'Risikofaktoren für Lokalanästhetika-Intoxikationen',
 'metadata': {'detection_class_prob': 0.4981774091720581,
  'coordinates': {'points': ((192.57080078125, 1372.3062744140625),
    (192.57080078125, 1418.0506666666668),
    (1118.0723333333333, 1418.0506666666668),
    (1118.0723333333333, 1372.3062744140625)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 13}}

{'type': 'NarrativeText',
 'element_id': '3447e22b-8642-4b76-9b96-3e9df909c95d',
 'text': 'Wesentliche gesicherte sowie vermutete Risikofaktoren für LAST-Ereignisse sind in Tabelle 2 dargestellt. Hierbei spielen patientenseitig vor allem die Altersklassen der unter 16- und der über 60-jährigen in Kombinationen mit eingeschränkten kardialen und zentralvenösen Komorbiditäten eine herausragende Rolle [24].',
 'metadata': {'detection_class_prob': 0.9006506204605103,
  'coordinates': {'points': ((188.93092346191406, 1555.1676025390625),
    (188.93092346191406, 1837.8999999999999),
    (1518.1966666666672, 1837.8999999999999),
    (1518.1966666666672, 1555.1676025390625)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 13}}

{'type': 'UncategorizedText',
 'element_id': 'e0d69ce2-0ca3-4849-83e8-ddab3e78ac6e',
 'text': '13',
 'metadata': {'coordinates': {'points': ((1469.3333333333335,
     2027.5666666666666),
    (1469.3333333333335, 2060.9),
    (1510.6999999999998, 2060.9),
    (1510.6999999999998, 2027.5666666666666)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 13}}

{'type': 'NarrativeText',
 'element_id': '3d1624bb-c13d-447d-b1da-4058f9dfc226',
 'text': 'Die Anwendung von Bupivacain sowie anderen lipophilen Lokalanästhetika in höheren',
 'metadata': {'detection_class_prob': 0.449284166097641,
  'coordinates': {'points': ((191.55545043945312, 205.56666666666672),
    (191.55545043945312, 238.90000000000003),
    (1510.6999999999996, 238.90000000000003),
    (1510.6999999999996, 205.56666666666672)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 14}}

{'type': 'NarrativeText',
 'element_id': '90520552-5f67-4056-976e-6a4d81806e19',
 'text': 'Dosierungen, wie sie vor allem bei peripheren und epiduralen Blockaden eingesetzt werden, ist',
 'metadata': {'detection_class_prob': 0.34748464822769165,
  'coordinates': {'points': ((191.88795471191406, 286.9),
    (191.88795471191406, 320.2333333333333),
    (1510.6999999999996, 320.2333333333333),
    (1510.6999999999996, 286.9)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 14}}

{'type': 'NarrativeText',
 'element_id': 'f7d4ea85-ae0d-463f-8b00-90b8cc39dec1',
 'text': 'ein typischer, modifizierbarer Risikofaktor für das Auftreten eines LAST-Ereignisses.',
 'metadata': {'detection_class_prob': 0.5844754576683044,
  'coordinates': {'points': ((196.8333333333332, 368.23333333333323),
    (196.8333333333332, 401.56666666666655),
    (1330.1999999999996, 401.56666666666655),
    (1330.1999999999996, 368.23333333333323)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 14}}

{'type': 'Title',
 'element_id': '4f8d7f9c-0800-4c89-a916-8e89e8e5c692',
 'text': 'Tabelle 2. Risiko-/ Einflussfaktoren der Lokalanästhetikaintoxikation (LAST), ergänzt nach [25]',
 'metadata': {'detection_class_prob': 0.42959606647491455,
  'coordinates': {'points': ((196.3920440673828, 528.0666666666667),
    (196.3920440673828, 553.8166666666667),
    (1172.9833333333333, 553.8166666666667),
    (1172.9833333333333, 528.0666666666667)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 14}}

{'type': 'Title',
 'element_id': 'ef18224c-6f80-4b0c-89a2-8c655451e217',
 'text': 'Risikofaktoren der LAST',
 'metadata': {'coordinates': {'points': ((211.83333333333334,
     588.7361666666667),
    (211.83333333333334, 610.9028333333333),
    (436.5096666666667, 610.9028333333333),
    (436.5096666666667, 588.7361666666667)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-05-10T23:32:02',
  'filetype': 'PPM',
  'languages': ['deu'],
  'page_number': 14}}

{'type': 'Table',
 'element_id': '6e6a95d2-99b6-45b9-8408-6fb2f8fc0d52',
 'text': 'Patienteneigenschaften - Extremes Alter (< 16 Jahre und > 60 Jahre) - Geringe Muskelmasse (v.a. Neugeborene, Kinder und ältere Patienten) - Weibliches Geschlecht - Schwangerschaft Komorbiditäten - Kardiale Vorerkrankungen (v.a. Arrhythmien, Reizleitungsstörungen/-abnormalitäten, kardiale Ischämie, Herzinsuffizienz) - Leber- und Nierenerkrankungen - Metabolische Störungen (v.a. Diabetes mellitus, Mitochondropathien, Isovalerianazidämie, Carnitinmangel) - Zentralnervöse Krankheiten - Geringe Plasma-Protein-Bindung, Hypalbuminämie (Leberinsuffizienz, Mangelernährung, Kinder, Schwangerschaft) Lokalanästhetikum und Regionalverfahren - Unzureichende präventive Maßnahmen (s.u.) - Physikochemische Eigenschaften des LA - Verwendung von hochlipophilen LA, v.a. Bupivacain - Dosis, Testdosis - Kontinuierliche Infusion/Katheterverfahren - Periphere Nervenblockaden - Injektionsort mit hoher Resorptionsquote (z. B. Int

In [28]:
import base64
import io
from PIL import Image as PILImage

def display_element(el):
    """
    Open the image payload of an element in the default image viewer.

    The base64 string is read from ``el.metadata.image_base64``. Nothing
    happens when the element has no metadata or no image payload. A leading
    ``data:...,`` prefix is removed before decoding. Errors while decoding or
    opening the image are printed instead of raised.
    """
    try:
        img_data = getattr(getattr(el, "metadata", None), "image_base64", None)
        if not img_data:
            # no payload: nothing to show
            return
        # keep only the part after a possible "data:<mime>;base64," prefix
        img_bytes = base64.b64decode(img_data.split(",", 1)[-1])
        img = PILImage.open(io.BytesIO(img_bytes))
        img.show()
    except Exception as e:
        print(f"Error displaying element: {e}")
# Show the image payload of the last original element of the chunk at index 10.
display_element(chunks[10].metadata.orig_elements[-1])

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


In [41]:
# render html
from IPython.core.display import HTML
def render_html(html):
    """Render the given HTML string as rich output of the current cell."""
    rendered = HTML(html)
    display(rendered)

# Render the extracted HTML (metadata.text_as_html) of the last original element of chunk 10.
render_html(chunks[10].metadata.orig_elements[-1].metadata.text_as_html)

0,1
Patienteneigenschaften,Extremes Alter (< 16 Jahre und > 60 Jahre)
Patienteneigenschaften,"Geringe Muskelmasse (v.a. Neugeborene, Kinder und ältere"
Patienteneigenschaften,Patienten)
Patienteneigenschaften,Weibliches Geschlecht
Patienteneigenschaften,Schwangerschaft
Komorbiditäten Lokalanästhetikum und Regionalverfahren,Kardiale Vorerkrankungen
Komorbiditäten Lokalanästhetikum und Regionalverfahren,"(v.a. Arrhythmien, Reizleitungsstörungen/-abnormalitäten, kardiale Ischämie, Herzinsuffizienz)"
Komorbiditäten Lokalanästhetikum und Regionalverfahren,Leber- und Nierenerkrankungen
Komorbiditäten Lokalanästhetikum und Regionalverfahren,Metabolische Störungen
Komorbiditäten Lokalanästhetikum und Regionalverfahren,"(v.a. Diabetes mellitus, Mitochondropathien, Isovalerianazidämie, Carnitinmangel)"


In [49]:
def reconstruct_table(table) -> str:
    """
    Ask the chat model to repair the extracted HTML of a table.

    ``table`` is an element object (not a dict) whose metadata provides:
    - ``metadata.text_as_html``: the extracted, possibly faulty table HTML
    - ``metadata.image_base64``: base64 screenshot of the table

    Both are sent to the model, which is instructed to return only the
    corrected HTML. A missing HTML string is sent as empty text so the model
    can still work from the screenshot.

    Returns the model's answer as a string.

    Raises ValueError if the element has no screenshot payload.
    """
    table_screenshot = table.metadata.image_base64
    if not table_screenshot:
        raise ValueError("Table element has no image_base64 payload")
    # avoid sending the literal text "None" when no HTML was extracted
    table_html = table.metadata.text_as_html or ""

    chat_prompt = ChatPromptTemplate.from_messages(
        [
            # System message
            (
                "system",
                "Du bist ein medizinischer Experte, der medizinische Leitlinien analysiert. "
                "Deine Aufgabe ist es, Tabellen zu rekonstruieren. "
                "Du erhältst eine Tabelle in Form von html, die aber Fehler enthalten kann. Außerdem bekommst du einen Screenshot der Tabelle. "
                "Korrigiere die Fehler in der Tabelle und gib sie als html zurück. "
                "Versuche ein einfaches Design zu verwenden, das die Tabelle gut lesbar macht. "
                "Gib NUR den rekonstruierten HTML-Code zurück, ohne zusätzliche Erklärungen oder Kommentare.",
            ),
            # User message with text and image
            (
                "user",
                [
                    {
                        "type": "text",
                        "text": "Hier ist das extrahiert HTML der Tabelle:\n\n{table_html}"
                    },
                    {
                        "type": "image_url",
                        # the base64 screenshot is substituted for {table_screenshot}
                        "image_url": {"url": "data:image/jpeg;base64,{table_screenshot}"},
                    },
                ],
            ),
        ]
    )

    chain = chat_prompt | model | StrOutputParser()

    return chain.invoke(
        {
            "table_html": table_html,
            "table_screenshot": table_screenshot,
        }
    )

# Reconstruct the table stored as the last original element of chunk 10.
result = reconstruct_table(chunks[10].metadata.orig_elements[-1])

In [50]:
# Render the HTML returned by reconstruct_table.
render_html(result)

Risikofaktoren der LAST,Risikofaktoren der LAST.1
Patienteneigenschaften,Extremes Alter (< 16 Jahre und > 60 Jahre)
Patienteneigenschaften,"Geringe Muskelmasse (v.a. Neugeborene, Kinder und ältere Patienten)"
Patienteneigenschaften,Weibliches Geschlecht
Patienteneigenschaften,Schwangerschaft
Patienteneigenschaften,
Komorbiditäten,"Kardiale Vorerkrankungen (v.a. Arrhythmien, Reizleitungsstörungen/-abnormalitäten, kardiale Ischämie, Herzinsuffizienz)"
Komorbiditäten,Leber- und Nierenerkrankungen
Komorbiditäten,"Metabolische Störungen (v.a. Diabetes mellitus, Mitochondropathien, Isovalerianazidämie, Carnitinmangel)"
Komorbiditäten,Zentralnervöse Krankheiten
Komorbiditäten,"Geringe Plasma-Protein-Bindung, Hypalbuminämie (Leberinsuffizienz, Mangelernährung, Kinder, Schwangerschaft)"


In [72]:
# Render the originally extracted table HTML again, for comparison with the reconstructed version.
render_html(chunks[10].metadata.orig_elements[-1].metadata.text_as_html)

0,1
Patienteneigenschaften,Extremes Alter (< 16 Jahre und > 60 Jahre)
Patienteneigenschaften,"Geringe Muskelmasse (v.a. Neugeborene, Kinder und ältere"
Patienteneigenschaften,Patienten)
Patienteneigenschaften,Weibliches Geschlecht
Patienteneigenschaften,Schwangerschaft
Komorbiditäten Lokalanästhetikum und Regionalverfahren,Kardiale Vorerkrankungen
Komorbiditäten Lokalanästhetikum und Regionalverfahren,"(v.a. Arrhythmien, Reizleitungsstörungen/-abnormalitäten, kardiale Ischämie, Herzinsuffizienz)"
Komorbiditäten Lokalanästhetikum und Regionalverfahren,Leber- und Nierenerkrankungen
Komorbiditäten Lokalanästhetikum und Regionalverfahren,Metabolische Störungen
Komorbiditäten Lokalanästhetikum und Regionalverfahren,"(v.a. Diabetes mellitus, Mitochondropathien, Isovalerianazidämie, Carnitinmangel)"


In [48]:
# Show the raw string stored in `result`.
result

'<table>\n  <tbody>\n    <tr>\n      <td rowspan="5">Patienteneigenschaften</td>\n      <td>Extremes Alter (&lt; 16 Jahre und &gt; 60 Jahre)</td>\n    </tr>\n    <tr>\n      <td>Geringe Muskelmasse (v.a. Neugeborene, Kinder und ältere Patienten)</td>\n    </tr>\n    <tr>\n      <td>Weibliches Geschlecht</td>\n    </tr>\n    <tr>\n      <td>Schwangerschaft</td>\n    </tr>\n    <tr>\n      <td rowspan="6">Komorbiditäten</td>\n      <td>Kardiale Vorerkrankungen (v.a. Arrhythmien, Reizleitungsstörungen/-abnormalitäten, kardiale Ischämie, Herzinsuffizienz)</td>\n    </tr>\n    <tr>\n      <td>Leber- und Nierenerkrankungen</td>\n    </tr>\n    <tr>\n      <td>Metabolische Störungen (v.a. Diabetes mellitus, Mitochondropathien, Isovalerianazidämie, Carnitinmangel)</td>\n    </tr>\n    <tr>\n      <td>Zentralnervöse Krankheiten</td>\n    </tr>\n    <tr>\n      <td>Geringe Plasma-Protein-Bindung, Hypalbuminämie (Leberinsuffizienz, Mangelernährung, Kinder, Schwangerschaft)</td>\n    </tr>\n    <t

In [52]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode

# Enable table structure recognition in the docling PDF pipeline.
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model

# Converter that applies the pipeline options above to PDF input.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Redefines INPUT_PDFS with the same single file used earlier in the notebook.
INPUT_PDFS = ["pdfs/001_044l_S1Praevention-Therapie-systemischen-Lokalanaesthetika-Intoxikation-LAST_2025-01-abgelaufen.pdf"]


In [55]:
# Convert the first input PDF with the docling converter.
converted = doc_converter.convert(INPUT_PDFS[0])



In [64]:
# Inspect the text cells of the page at list index 12.
converted.pages[12].cells

[PdfTextCell(index=0, rgba=ColorRGBA(r=0, g=0, b=0, a=255), rect=BoundingRectangle(r_x0=70.5, r_y0=49.70600000000002, r_x1=73.212, r_y1=49.70600000000002, r_x2=73.212, r_y2=39.81600000000003, r_x3=70.5, r_y3=39.81600000000003, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>), text=' ', orig=' ', text_direction=<TextDirection.LEFT_TO_RIGHT: 'left_to_right'>, confidence=1.0, from_ocr=False, rendering_mode=<PdfCellRenderingMode.UNKNOWN: -1>, widget=False, font_key='/TT0', font_name='/NRZFRX+Calibri'),
 PdfTextCell(index=1, rgba=ColorRGBA(r=0, g=0, b=0, a=255), rect=BoundingRectangle(r_x0=305.94, r_y0=49.70600000000002, r_x1=308.652, r_y1=49.70600000000002, r_x2=308.652, r_y2=39.81600000000003, r_x3=305.94, r_y3=39.81600000000003, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>), text=' ', orig=' ', text_direction=<TextDirection.LEFT_TO_RIGHT: 'left_to_right'>, confidence=1.0, from_ocr=False, rendering_mode=<PdfCellRenderingMode.UNKNOWN: -1>, widget=False, font_key='/TT0', font_name='/NRZFRX+C

In [71]:
# List the picture items of the converted document.
converted.document.pictures

[PictureItem(self_ref='#/pictures/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.PICTURE: 'picture'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=71.6587905883789, t=776.8360061645508, r=317.7592468261719, b=746.927490234375, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))], captions=[], references=[], footnotes=[], image=None, annotations=[]),
 PictureItem(self_ref='#/pictures/1', parent=RefItem(cref='#/body'), children=[RefItem(cref='#/texts/74'), RefItem(cref='#/texts/75'), RefItem(cref='#/texts/76'), RefItem(cref='#/texts/77'), RefItem(cref='#/texts/78'), RefItem(cref='#/texts/79')], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.PICTURE: 'picture'>, prov=[ProvenanceItem(page_no=11, bbox=BoundingBox(l=108.9175796508789, t=659.7506103515625, r=490.43743896484375, b=517.7368774414062, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))], captions=[], refere

In [None]:
from unstructured.ingest.connector.local import SimpleLocalConfig
from unstructured.ingest.interfaces import ReadConfig, ProcessorConfig, PartitionConfig
from unstructured.ingest.runner import LocalRunner

def get_shortest_image_width_heigth(image: Image) -> float:
    """
    Return the length of the shorter side of an element's bounding box.

    The box is read from ``image.metadata.coordinates.points``; the first and
    the third point are treated as opposite corners of the box. Absolute
    differences are used so the result is never negative, regardless of which
    corner is listed first or in which direction the axes of the coordinate
    system run.

    Returns the smaller of the box width and height, in the units of the
    element's coordinate system.
    """
    corner_a = image.metadata.coordinates.points[0]
    corner_b = image.metadata.coordinates.points[2]
    width = abs(corner_b[0] - corner_a[0])
    height = abs(corner_b[1] - corner_a[1])
    return min(width, height)


    
# 1. Define the filter function
def filter_small_images(file_path, metadata, elements):
    """
    Return the elements without Image elements that are too small.

    An element counts as an image when the name of its type contains
    "Image". Such an element is dropped when the shorter side of its
    bounding box is below the threshold; all other elements are kept in
    their original order. ``file_path`` and ``metadata`` are not used.
    """
    MIN_PIXELS = 120

    def _is_small_image(candidate):
        # non-image elements are never filtered out
        if "Image" not in str(type(candidate)):
            return False
        return get_shortest_image_width_heigth(candidate) < MIN_PIXELS

    return [candidate for candidate in elements if not _is_small_image(candidate)]

# 2. Configure the individual steps
# NOTE(review): confirm that the installed unstructured ingest version
# accepts these config classes and the `post_index_filter` argument.
read_config = ReadConfig()
partition_config = PartitionConfig()  # default partitioning
processor_config = ProcessorConfig(
    # filter function, intended to run before partitioning
    post_index_filter=filter_small_images,
    verbose=True,
    output_dir="./output",
)

# 3. Initialize and run the runner
runner = LocalRunner(
    connector_config=SimpleLocalConfig(
        input_directory="./input_docs",
        recursive=True,
    ),
    read_config=read_config,
    partition_config=partition_config,
    processor_config=processor_config,
)

runner.run()
def add_custom_metadata(file_path, metadata, elements):
    """
    Add document-level fields to ``metadata`` and copy them onto each element.

    ``metadata`` is modified in place: "doc_id" is set to the file name
    without its extension and "source" to a fixed label. Every element's
    metadata is then updated with the resulting mapping.

    Returns the same ``elements`` object that was passed in.
    """
    # Derive the document ID from the file name
    document_id = Path(file_path).stem
    metadata["doc_id"] = document_id
    metadata["source"] = "vertriebsreports"
    for element in elements:
        element.metadata.update(metadata)
    return elements

# Rebinds processor_config: all filter hooks are disabled and
# add_custom_metadata is registered as the metadata callback.
# NOTE(review): confirm that ProcessorConfig accepts these keyword arguments
# in the installed unstructured ingest version.
processor_config = ProcessorConfig(
    post_index_filter=None,
    post_download_filter=None,
    post_uncompress_filter=None,
    partition_config=partition_config,
    add_metadata_fn=add_custom_metadata,
)