In [29]:
import os
from docling.document_converter import DocumentConverter
from docling_core.types.doc import ImageRefMode

class PDFToMarkdownConverter:
    """Convert a PDF under ``<main_folder>/input`` to Markdown under ``<main_folder>/output``.

    The conversion itself is performed by ``DocumentConverter`` and the result is
    exported with ``image_mode=ImageRefMode.REFERENCED``. The exported Markdown is
    cached on the instance, so calling ``convert_and_save()`` and then
    ``print_markdown()`` converts the PDF only once.

    Attributes:
        pdf_file: File name of the PDF, relative to ``input_folder``.
        main_folder: Root folder holding the ``input`` and ``output`` sub-folders.
        input_folder: ``<main_folder>/input``.
        output_folder: ``<main_folder>/output``.
        source: Full path of the PDF that will be converted.
        output_file: Name of the Markdown file (``pdf_file`` with a ``.md`` extension).
    """

    def __init__(self, pdf_file, main_folder="data"):
        """Derive all input/output paths; no file-system access happens here.

        Args:
            pdf_file: Name of the PDF inside ``<main_folder>/input``.
            main_folder: Root data folder. Defaults to ``"data"``.
        """
        self.pdf_file = pdf_file
        self.main_folder = main_folder
        self.input_folder = os.path.join(main_folder, "input")
        self.output_folder = os.path.join(main_folder, "output")
        self.source = os.path.join(self.input_folder, self.pdf_file)
        self.output_file = f"{os.path.splitext(self.pdf_file)[0]}.md"
        # (cache key, markdown text) of the most recent conversion, or None.
        self._markdown_cache = None

    def _to_markdown(self):
        """Return the Markdown export of ``self.source``, converting at most once.

        The cache key is the source path plus the file's modification time and
        size, so editing ``self.source`` or replacing the PDF on disk triggers a
        fresh conversion instead of returning stale text.

        Raises:
            FileNotFoundError: If ``self.source`` is not an existing file.
        """
        source = self.source
        if not os.path.isfile(source):
            # Fail early with a clear message, before any conversion work starts.
            raise FileNotFoundError(f"Input PDF not found: {source}")

        info = os.stat(source)
        cache_key = (source, info.st_mtime_ns, info.st_size)

        # getattr keeps this working for instances created without __init__.
        cached = getattr(self, "_markdown_cache", None)
        if cached is not None and cached[0] == cache_key:
            return cached[1]

        result = DocumentConverter().convert(source)
        markdown = result.document.export_to_markdown(image_mode=ImageRefMode.REFERENCED)
        self._markdown_cache = (cache_key, markdown)
        return markdown

    def convert_and_save(self):
        """Convert the PDF and write the Markdown to the output folder.

        Returns:
            Path of the written Markdown file
            (``os.path.join(self.output_folder, self.output_file)``).

        Raises:
            FileNotFoundError: If the input PDF does not exist. In that case no
                output folder is created.
        """
        # Convert first so a missing input does not leave an empty output folder.
        markdown = self._to_markdown()

        output_path = os.path.join(self.output_folder, self.output_file)
        os.makedirs(self.output_folder, exist_ok=True)
        # pdf_file may contain sub-directories ("reports/a.pdf"); make sure the
        # directory that will actually hold the .md file exists as well.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as fp:
            fp.write(markdown)

        return output_path

    def print_markdown(self):
        """Print the Markdown export of the PDF to standard output.

        Raises:
            FileNotFoundError: If the input PDF does not exist.
        """
        print(self._to_markdown())


In [30]:
# Convert one PDF to Markdown and report where the result was written.
pdf_file = "DocLayNet.pdf"  # PDF file name; looked up under data/input (default main_folder is "data")
# Build the converter with the default main_folder; `converter` is reused by the next cell.
converter = PDFToMarkdownConverter(pdf_file)
# Runs the conversion, writes the .md file into data/output and returns its path.
output_path = converter.convert_and_save()
print(f"Markdown file saved at: {output_path}")

Markdown file saved at: data\output\DocLayNet.md


In [31]:
# Print the Markdown export of the same PDF to the cell output, using the
# `converter` instance created in the cell above.
converter.print_markdown()

## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

## ABSTRACT

Accurate document layout analysis is a key requirement for high-quality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops 