In [7]:
from pdf2image import convert_from_path
import os

In [8]:
def pdf_to_images(pdf_dir, output_base_dir, dpi=300):
    """
    Converts all PDF files in a directory to images and saves them in the output base directory.
    
    Args:
        pdf_dir (str): Path to the directory containing PDF files.
        output_base_dir (str): Base directory to save the output images.
        dpi (int): Resolution of the output images in dots per inch (default=300).
    """
    # Ensure the output base directory exists
    os.makedirs(output_base_dir, exist_ok=True)

    # Get a list of PDF files in the directory
    pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]

    # Process each PDF file
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        output_image_path = os.path.join(output_base_dir, f"{os.path.splitext(pdf_file)[0]}.png")

        print(f"Processing {pdf_file}...")

        # Convert PDF to image
        try:
            pages = convert_from_path(pdf_path, dpi=dpi)
            if pages:
                pages[0].save(output_image_path, "PNG")
                print(f"Saved: {output_image_path}")
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

In [9]:
pdf_dir = "../data/raw/pdf/"
output_base_dir = "../data/test"
pdf_to_images(pdf_dir, output_base_dir)

Processing document_6.pdf...
Saved: ../data/test/document_6.png
Processing document_2.pdf...
Saved: ../data/test/document_2.png
Processing document_9.pdf...
Saved: ../data/test/document_9.png
Processing document_17.pdf...
Saved: ../data/test/document_17.png
Processing document_13.pdf...
Saved: ../data/test/document_13.png
Processing document_8.pdf...
Saved: ../data/test/document_8.png
Processing document_16.pdf...
Saved: ../data/test/document_16.png
Processing document_11.pdf...
Saved: ../data/test/document_11.png
Processing document_1.pdf...
Saved: ../data/test/document_1.png
Processing document_18.pdf...
Saved: ../data/test/document_18.png
Processing document_20.pdf...
Saved: ../data/test/document_20.png
Processing document_10.pdf...
Saved: ../data/test/document_10.png
Processing document_4.pdf...
Saved: ../data/test/document_4.png
Processing document_14.pdf...
Saved: ../data/test/document_14.png
Processing document_15.pdf...
Saved: ../data/test/document_15.png
Processing document_19