In [7]:
from pdf2image import convert_from_path
import os

In [8]:
def pdf_to_images(pdf_dir, output_base_dir, dpi=300):
    """
    Convert every PDF file in a directory to per-page PNG images.

    Each PDF gets its own numbered subdirectory under ``output_base_dir``
    ("0", "1", ...). The number is the PDF's position in the *sorted* list of
    matching file names, so the PDF-to-folder mapping is the same on every
    run and every platform. Pages are written as ``page_1.png``,
    ``page_2.png``, ... inside that subdirectory.

    Args:
        pdf_dir (str): Path to the directory containing PDF files. Only
            regular files whose name ends in ".pdf" (case-insensitive) are
            processed; subdirectories are not searched.
        output_base_dir (str): Base directory to save the output images.
            Created if it does not exist.
        dpi (int): Resolution passed to ``convert_from_path`` (default=300).

    Returns:
        None. Progress and errors are reported via ``print``.

    Raises:
        OSError: If ``pdf_dir`` cannot be listed or an output directory
            cannot be created. Errors raised while converting or saving an
            individual PDF are caught, printed, and processing continues
            with the next file.
    """
    # Ensure the output base directory exists
    os.makedirs(output_base_dir, exist_ok=True)

    # os.listdir() returns names in arbitrary order; sort them so that the
    # numeric subdirectory assigned to each PDF is reproducible. Also accept
    # upper/mixed-case extensions and ignore directories named like PDFs.
    pdf_files = sorted(
        name
        for name in os.listdir(pdf_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_dir, name))
    )

    # Process each PDF file
    for idx, pdf_file in enumerate(pdf_files):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        output_dir = os.path.join(output_base_dir, str(idx))  # Subdirectory for each PDF
        os.makedirs(output_dir, exist_ok=True)

        print(f"Processing {pdf_file}...")

        # Convert PDF to images. This is deliberately best-effort: one bad
        # PDF is reported and skipped rather than aborting the whole batch.
        try:
            pages = convert_from_path(pdf_path, dpi=dpi)
            # Page files are numbered from 1, not 0.
            for page_num, page in enumerate(pages, start=1):
                image_path = os.path.join(output_dir, f"page_{page_num}.png")
                page.save(image_path, "PNG")
                print(f"Saved: {image_path}")
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

In [9]:
# Source folder holding the PDFs, and the root folder for the rendered pages.
pdf_dir = "data/raw/pdf/"
output_base_dir = "data/processed/"

# Run the conversion; no dpi is given, so the function's default applies.
pdf_to_images(pdf_dir=pdf_dir, output_base_dir=output_base_dir)

Processing UEB-Sample-1-PDF-file1.pdf...
Saved: data/processed/0\page_1.png
Saved: data/processed/0\page_2.png
Processing UEB-Sample-2-PDF-file.pdf...
Saved: data/processed/1\page_1.png
Saved: data/processed/1\page_2.png
Processing UEB-Sample-3-PDF-file.pdf...
Saved: data/processed/2\page_1.png
Saved: data/processed/2\page_2.png
Saved: data/processed/2\page_3.png
Processing UEB-Sample-4-PDF-file.pdf...
Saved: data/processed/3\page_1.png
Saved: data/processed/3\page_2.png
Processing UEB-Sample-5-PDF-file.pdf...
Saved: data/processed/4\page_1.png
Saved: data/processed/4\page_2.png
Saved: data/processed/4\page_3.png
Processing UEB-Sample-6-PDF-file.pdf...
Saved: data/processed/5\page_1.png
Processing UEB-Sample-7-PDF-file.pdf...
Saved: data/processed/6\page_1.png
Saved: data/processed/6\page_2.png
Saved: data/processed/6\page_3.png
Processing UEB-Sample-7a-PDF-file.pdf...
Saved: data/processed/7\page_1.png
Saved: data/processed/7\page_2.png
Processing UEB-Sample-8-PDF-file.pdf...
Saved: d