In [1]:
from pdf2image import convert_from_path
import os

In [2]:
def pdf_to_images(pdf_dir, output_base_dir, dpi=300):
    """
    Converts all PDF files in a directory to images and saves them in subdirectories.
    
    Args:
        pdf_dir (str): Path to the directory containing PDF files.
        output_base_dir (str): Base directory to save the output images.
        dpi (int): Resolution of the output images in dots per inch (default=300).
    """
    # Ensure the output base directory exists
    os.makedirs(output_base_dir, exist_ok=True)

    # Get a list of PDF files in the directory
    pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]

    # Process each PDF file
    for idx, pdf_file in enumerate(pdf_files):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        output_dir = os.path.join(output_base_dir, str(idx))  # Subdirectory for each PDF
        os.makedirs(output_dir, exist_ok=True)

        print(f"Processing {pdf_file}...")

        # Convert PDF to images
        try:
            pages = convert_from_path(pdf_path, dpi=dpi)
            for page_num, page in enumerate(pages):
                image_path = os.path.join(output_dir, f"page_{page_num + 1}.png")
                page.save(image_path, "PNG")
                print(f"Saved: {image_path}")
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

In [3]:
pdf_dir = "../data/raw/pdf/"
output_base_dir = "../data/test"
pdf_to_images(pdf_dir, output_base_dir)

Processing document_2.pdf...
Saved: ../data/test/0/page_1.png
Processing document_1.pdf...
Saved: ../data/test/1/page_1.png
Processing document_4.pdf...
Saved: ../data/test/2/page_1.png
Processing document_3.pdf...
Saved: ../data/test/3/page_1.png
Processing document_5.pdf...
Saved: ../data/test/4/page_1.png
