In [None]:
# Dependencies
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

In [None]:
# Collect the input documents and list them as a quick sanity check.
# `Path` is already imported in the dependencies cell at the top.

folder_path = Path("data/pdfs/")

# rglob("*") also yields sub-directories; keep regular files only so that
# folders are not handed to the converter and do not inflate the count.
# Sorting makes the processing order reproducible across runs and platforms.
input_paths = sorted(
    file_path for file_path in folder_path.rglob("*") if file_path.is_file()
)

for file_path in input_paths:
    print(file_path)
print(len(input_paths))

# Run Converter

In [None]:
# Every argument is spelled out to show what can be customized; per the
# original note, all of them are optional and have internal defaults.

# Whitelist of input formats; non-matching files are ignored.
supported_formats = [
    InputFormat.PDF,
    InputFormat.IMAGE,
    InputFormat.DOCX,
    InputFormat.HTML,
    InputFormat.PPTX,
    InputFormat.ASCIIDOC,
    InputFormat.MD,
]

# Explicit pipeline/backend pairing for the two formats configured here.
pdf_option = PdfFormatOption(
    pipeline_cls=StandardPdfPipeline,
    backend=PyPdfiumDocumentBackend,
)
docx_option = WordFormatOption(
    pipeline_cls=SimplePipeline,
    backend=MsWordDocumentBackend,
)

doc_converter = DocumentConverter(
    allowed_formats=supported_formats,
    format_options={
        InputFormat.PDF: pdf_option,
        InputFormat.DOCX: docx_option,
    },
)

# Different Conversion Process with PDF

In [None]:
# Convert every collected input and write one Markdown file per document.

out_path = Path("data/output/")
# open("w") does not create missing parent folders, so make sure the output
# directory exists before the first write.
out_path.mkdir(parents=True, exist_ok=True)

conv_results = doc_converter.convert_all(input_paths)

for res in conv_results:
    # NOTE(review): output files are named by stem only, so two inputs with the
    # same stem (e.g. in different sub-folders) overwrite each other.
    md_file = out_path / f"{res.input.file.stem}.md"

    # Export the Docling document to Markdown. The encoding is explicit so the
    # write does not depend on the platform's default locale encoding.
    with md_file.open("w", encoding="utf-8") as fp:
        fp.write(res.document.export_to_markdown())

    # Report only after the file has actually been written.
    print(
        f"Document {res.input.file.name} converted."
        f"\nSaved markdown output to: {str(out_path)}"
    )

In [None]:
# Large file (~70-page PDF with multiple tables/images); expect a much longer run time.