In [1]:
import logging
from pathlib import Path
from pydantic import BaseModel, Field, field_validator
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc.base import ImageRefMode


# Configure the root logger at INFO level so that INFO records from this
# module and from imported libraries are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class PDFToMarkdownConfig(BaseModel):
    """
    Configuration for the PDFToMarkdownConverter.

    Attributes:
        pdf_file (str): The name of the input file to convert.
        main_folder (Path): The main folder where input and output directories
            are located. Defaults to ``data``.
        input_folder (Path): The folder containing the input file. When not
            supplied, defaults to ``main_folder / "input"``.
        output_folder (Path): The folder where converted Markdown files will be
            saved. When not supplied, defaults to ``main_folder / "output"``.

    Properties:
        source (Path): The full path to the input file
            (``input_folder / pdf_file``).

    Both folders are created on disk (including parents) while the model is
    being validated, whether they were passed explicitly or defaulted.
    """
    pdf_file: str
    main_folder: Path = Field(default=Path("data"))
    # ``None`` is a sentinel meaning "derive from main_folder"; it is resolved
    # by ``ensure_directory_exists``. ``validate_default=True`` is required
    # because pydantic skips field validators for default values otherwise,
    # which would leave the default folders uncreated.
    input_folder: Path = Field(default=None, validate_default=True)
    output_folder: Path = Field(default=None, validate_default=True)

    @property
    def source(self) -> Path:
        """Return the full path to the input file inside ``input_folder``."""
        return self.input_folder / self.pdf_file

    @field_validator("input_folder", "output_folder", mode="before")
    @classmethod
    def ensure_directory_exists(cls, v, info):
        """
        Resolve the folder path and make sure the directory exists.

        Args:
            v: The raw value supplied for the field, or ``None`` when the
                caller did not supply one.
            info: Pydantic validation info; ``info.data`` holds the fields
                validated so far and ``info.field_name`` names this field.

        Returns:
            Path: The resolved folder path, which exists on disk on return.
        """
        if v is None:
            # main_folder is declared before these fields, so it is already
            # validated here; fall back to "data" if its own validation failed.
            base = Path(info.data.get("main_folder", Path("data")))
            subfolder = "input" if info.field_name == "input_folder" else "output"
            v = base / subfolder
        path = Path(v)
        path.mkdir(parents=True, exist_ok=True)
        return path

class PDFToMarkdownConverter:
    """
    Convert the document described by a PDFToMarkdownConfig to Markdown.

    The conversion is performed by docling's ``DocumentConverter``; the result
    can be written to the configured output folder or printed to stdout.
    """
    def __init__(self, config: PDFToMarkdownConfig):
        """
        Initialize the PDFToMarkdownConverter class with configuration.

        Args:
            config (PDFToMarkdownConfig): Configuration object for the converter.
        """
        self.config = config

    def _get_converter(self, embedded_images: bool = False):
        """
        Set up and return a DocumentConverter instance.

        Args:
            embedded_images (bool, optional): When True, the PDF pipeline is
                configured with ``images_scale = 2.0`` and with page and
                picture image generation enabled. Defaults to False.

        Returns:
            DocumentConverter: An instance with the PDF format options applied.
        """
        pipeline_options = PdfPipelineOptions()
        if embedded_images:
            pipeline_options.images_scale = 2.0
            pipeline_options.generate_page_images = True
            pipeline_options.generate_picture_images = True

        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

    @staticmethod
    def _image_mode(embedded_images: bool):
        """Return the ImageRefMode matching the requested image handling."""
        return ImageRefMode.EMBEDDED if embedded_images else ImageRefMode.PLACEHOLDER

    def _convert_source(self, embedded_images: bool):
        """Run the docling conversion on ``config.source`` and return its result."""
        converter = self._get_converter(embedded_images=embedded_images)
        return converter.convert(self.config.source)

    def convert_to_markdown(self, embedded_images: bool = False) -> Path:
        """
        Convert the input file to Markdown and save it in the output folder.

        The output file is named ``<input stem>-embedded.md`` or
        ``<input stem>-placeholder.md`` depending on ``embedded_images``.

        Args:
            embedded_images (bool, optional): Whether to include embedded images in the Markdown file. Defaults to False.

        Returns:
            Path: The path to the saved Markdown file.

        Raises:
            FileNotFoundError: Logged and re-raised if raised during conversion or saving.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            result = self._convert_source(embedded_images)

            # Define the output file path
            suffix = "embedded" if embedded_images else "placeholder"
            output_filename = f"{Path(self.config.pdf_file).stem}-{suffix}.md"
            output_path = self.config.output_folder / output_filename

            # Do not rely on the folder having been created elsewhere: make
            # sure it exists right before writing into it.
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Save the converted Markdown file
            result.document.save_as_markdown(
                output_path, image_mode=self._image_mode(embedded_images)
            )

            logger.info("Markdown file saved at: %s", output_path)
            return output_path

        except FileNotFoundError as fnfe:
            logger.error("File not found: %s", fnfe)
            raise

        except Exception as e:
            logger.error("An unexpected error occurred during conversion: %s", e)
            raise

    def print_markdown(self, embedded_images: bool = False) -> None:
        """
        Print the converted Markdown content to the console.

        Args:
            embedded_images (bool, optional): Whether to include embedded images in the Markdown content. Defaults to False.

        Raises:
            Exception: Any error raised during conversion or export is logged and re-raised.
        """
        try:
            result = self._convert_source(embedded_images)

            # Print the result
            print(
                result.document.export_to_markdown(
                    image_mode=self._image_mode(embedded_images)
                )
            )
        except Exception as e:
            logger.error("An error occurred while printing Markdown content: %s", e)
            raise

# Usage example: convert one input file twice, once per image mode.
if __name__ == "__main__":
    pdf_file = "Wniosek_o_udzielenie_zmiane_warunkow_kredytu.docx"
    config = PDFToMarkdownConfig(pdf_file=pdf_file)
    converter = PDFToMarkdownConverter(config)

    try:
        # First pass: images are replaced by placeholders.
        placeholder_path = converter.convert_to_markdown(embedded_images=False)
        print("Markdown with placeholders saved at: " + str(placeholder_path))

        # Second pass: images are embedded in the Markdown output.
        embedded_path = converter.convert_to_markdown(embedded_images=True)
        print("Markdown with embedded images saved at: " + str(embedded_path))
    except Exception as exc:
        # Any failure is reported through the module logger and not re-raised.
        logger.error("Conversion failed: %s", exc)


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Wniosek_o_udzielenie_zmiane_warunkow_kredytu.docx
INFO:docling.document_converter:Finished converting document Wniosek_o_udzielenie_zmiane_warunkow_kredytu.docx in 0.30 sec.
INFO:__main__:Markdown file saved at: data\output\Wniosek_o_udzielenie_zmiane_warunkow_kredytu-placeholder.md
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Wniosek_o_udzielenie_zmiane_warunkow_kredytu.docx


Markdown with placeholders saved at: data\output\Wniosek_o_udzielenie_zmiane_warunkow_kredytu-placeholder.md


INFO:docling.document_converter:Finished converting document Wniosek_o_udzielenie_zmiane_warunkow_kredytu.docx in 0.30 sec.
INFO:__main__:Markdown file saved at: data\output\Wniosek_o_udzielenie_zmiane_warunkow_kredytu-embedded.md


Markdown with embedded images saved at: data\output\Wniosek_o_udzielenie_zmiane_warunkow_kredytu-embedded.md
