This notebook covers some advanced Docling options:
1) Image extraction - embedding the images in the Markdown, or saving them to a specific folder with or without custom file names
- The image scale can be adjusted depending on the image clarity you need
2) Table extraction - converting a PDF (containing only tables) into JSON


Bonus tip:
1) Speed up the pipeline by disabling OCR when it is not needed

In [None]:
# Image extraction as embeddings / save it in a folder
import os
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice
)
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend

from pathlib import Path
# Folder passed to the pipeline as `artifacts_path`
# (presumably the locally downloaded Docling models - confirm for your machine).
artifacts_path = "C:/Users/z0047npb/Desktop/docling/models"

# Accelerator settings: CUDA device, 8 threads, flash-attention 2 switched off.
accelerator_options = AcceleratorOptions(
    num_threads=8,
    device=AcceleratorDevice.CUDA,
    cuda_use_flash_attention2=False,
)

# PDF pipeline settings reused by every conversion in this notebook.
pipeline_options = PdfPipelineOptions(
    accelerator_options=accelerator_options,
    artifacts_path=artifacts_path,
    # generate_page_images=False,
    generate_picture_images=True,
    images_scale=3.0,  # adjust to trade image clarity against size
    do_ocr=False,  # Bonus tip 01: skipping OCR speeds up the pipeline
    do_table_structure=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Input PDF and the folder that receives the generated Markdown files.
pdf_path = r"D:\personal_projects\Docling_project\pdf\ACTH_-_IMMULITE_2000_Systems_-_Rev_20_DXDCM_090234ed841fe8f9-1752273527910.pdf"
markdown_folder = r"D:\personal_projects\Docling_project\output"
os.makedirs(markdown_folder, exist_ok=True)

# PDF inputs use the pipeline options configured earlier in the notebook
# together with the docling-parse v4 backend.
pdf_format_option = PdfFormatOption(
    pipeline_options=pipeline_options,
    backend=DoclingParseV4DocumentBackend,
)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [None]:
# Document name = PDF file name without its directory or extension.
pdf_basename = os.path.basename(pdf_path)
doc_filename, _pdf_extension = os.path.splitext(pdf_basename)
print(doc_filename)

# Pictures of this document go to <markdown_folder>/images/<doc_filename>/.
image_folder = Path(markdown_folder, "images", doc_filename)
image_folder.mkdir(parents=True, exist_ok=True)

IMMULITE_2000_Systems_Interface_Specifications
D:\personal_projects\Docling_project\pdf\IMMULITE_2000_Systems_Interface_Specifications.pdf


In [13]:
# Convert the PDF with the configured converter; the following cells read
# the converted document from `result.document`.
result = converter.convert(pdf_path)

In [None]:
# Option A - Markdown with the pictures embedded (left disabled):
# md_filename = os.path.join(markdown_folder, f"{doc_filename}"+ "with_image_embed.md")
# result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# Option B - Markdown with externally referenced pictures; `image_folder`
# is handed to the export as the artifacts directory.
md_filename = os.path.join(markdown_folder, doc_filename + "with_image_ref.md")
result.document.save_as_markdown(
    md_filename,
    image_mode=ImageRefMode.REFERENCED,
    artifacts_dir=image_folder,
)


To save the images with custom names

In [None]:
# Write every picture to `image_folder` as "<doc_filename>-picture-<n>.png"
# (n counts pictures in document order) and point the picture's image URI at
# that file before exporting the Markdown.
picture_counter = 0
pictures = (
    item
    for item, _level in result.document.iterate_items()
    if isinstance(item, PictureItem)
)
for picture in pictures:
    picture_counter += 1
    element_image_filename = image_folder / f"{doc_filename}-picture-{picture_counter}.png"

    with element_image_filename.open("wb") as img_file:
        picture.get_image(result.document).save(img_file, "PNG")
    picture.image.uri = element_image_filename

md_filename = os.path.join(markdown_folder, doc_filename + "_with_ref_cutom_image.md")
result.document.save_as_markdown(
    md_filename,
    image_mode=ImageRefMode.REFERENCED,
    artifacts_dir=image_folder,
)

Table Extraction
1. If the entire document consists of tables with no body text, we can extract the table contents and convert them to JSON.

In [None]:
# Prerequisite: run the setup cells at the top of this notebook first
# (imports, pipeline options and `converter`), since this cell reuses them.
pdf_path = r"D:\online_help_pdf_parsing\pdf_file\High_Sensitivity_C_Reactive_Protein_2_OUS_-_Atellica_CH_-_Rev_04_DXDCM_090234ed823d6b7d-1743183404975.pdf"
doc_filename = os.path.splitext(os.path.basename(pdf_path))[0]
result = converter.convert(pdf_path)

# Create the output folder up front: the cells below write JSON and PNG files
# into it and would fail with FileNotFoundError if it did not exist.
output_dir_path = Path(r"D:\online_help_pdf_parsing\output")
output_dir_path.mkdir(parents=True, exist_ok=True)

In [None]:
import json
import pandas as pd

# One entry per table: the table exported to a DataFrame, then turned into a
# list of row records.
all_tables_json = [
    table.export_to_dataframe().to_dict(orient="records")
    for table in result.document.tables
]

# Write all tables into a single JSON file next to the other outputs.
merged_json_filename = output_dir_path / f"{doc_filename}-all-tables_v1.json"
print(f"Saving all tables to {merged_json_filename}")
with merged_json_filename.open("w", encoding="utf-8") as f:
    json.dump(all_tables_json, f, indent=2, ensure_ascii=False)


print(f"Total tables extracted: {len(result.document.tables)}")

If we need to save the tables from the PDF as separate image files, the same way we saved the pictures from the document:

In [None]:
# Save each table as its own PNG named "<doc_filename>-table-<n>.png", where
# <n> is the table's 1-based position in document order.
table_counter = 0  # must be initialised here; otherwise the loop raises NameError on a fresh kernel
for element, _level in result.document.iterate_items():
    if not isinstance(element, TableItem):
        continue
    table_counter += 1

    # Fetch the image before opening the file so a missing image does not
    # leave an empty PNG behind.
    # NOTE(review): get_image() can return None - presumably when no page or
    # table images were kept by the pipeline (e.g. generate_page_images is
    # off); confirm against the pipeline options in use.
    table_image = element.get_image(result.document)
    if table_image is None:
        print(f"Skipping table {table_counter}: no image available")
        continue

    element_image_filename = (
        output_dir_path / f"{doc_filename}-table-{table_counter}.png"
    )
    with element_image_filename.open("wb") as fp:
        table_image.save(fp, "PNG")