In [1]:
import os
from pdf2image import convert_from_path
from pathlib import Path

def convert_pdfs_to_images(pdf_dir, output_base_dir, poppler_path, dpi=300):
    """Render every PDF in ``pdf_dir`` to per-page PNG files.

    For each ``<name>.pdf`` found directly inside ``pdf_dir`` the pages are
    written to ``<output_base_dir>/<name>/page_<n>.png`` with ``n`` starting
    at 1. Progress and per-file errors are reported with ``print``; a PDF
    that fails to convert is skipped so the remaining files are still
    processed.

    Args:
        pdf_dir: Directory scanned (non-recursively) for files whose name
            ends in ``.pdf`` (case-insensitive).
        output_base_dir: Directory under which one sub-directory per PDF is
            created. It is created, including parents, if missing.
        poppler_path: Location of the Poppler binaries, forwarded to
            ``convert_from_path``.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Raises:
        FileNotFoundError: If ``poppler_path`` does not exist or ``pdf_dir``
            is not an existing directory.
    """
    # Verify paths
    print(f"Poppler path: {poppler_path}")
    print(f"PDF directory: {pdf_dir}")
    print(f"Output directory: {output_base_dir}")

    # Validate both inputs before touching the filesystem so a bad argument
    # does not leave a freshly created, empty output directory behind.
    if not os.path.exists(poppler_path):
        raise FileNotFoundError(f"Poppler path does not exist: {poppler_path}")
    if not os.path.isdir(pdf_dir):
        raise FileNotFoundError(f"PDF directory does not exist: {pdf_dir}")

    Path(output_base_dir).mkdir(parents=True, exist_ok=True)

    # Sorted for a reproducible processing order (os.listdir order is
    # arbitrary); directories that merely end in ".pdf" are ignored.
    pdf_files = sorted(
        name
        for name in os.listdir(pdf_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_dir, name))
    )

    print(f"Found {len(pdf_files)} PDF files")

    for pdf_file in pdf_files:
        pdf_name = os.path.splitext(pdf_file)[0]
        output_dir = os.path.join(output_base_dir, pdf_name)
        pdf_path = os.path.join(pdf_dir, pdf_file)

        try:
            print(f"Processing {pdf_file}...")
            images = convert_from_path(
                pdf_path,
                dpi=dpi,
                poppler_path=poppler_path
            )

            # Create the per-PDF folder only once conversion succeeded, so a
            # broken PDF does not leave an empty directory, and a failing
            # mkdir is reported like any other per-file error.
            Path(output_dir).mkdir(parents=True, exist_ok=True)

            for page_number, image in enumerate(images, start=1):
                image_path = os.path.join(output_dir, f'page_{page_number}.png')
                image.save(image_path, 'PNG')
                print(f'Saved {image_path}')

        except Exception as e:
            # Deliberately broad: this is the per-file boundary of a
            # best-effort batch job; report the failure and keep going.
            print(f'Error processing {pdf_file}: {e}')

# Machine-specific locations: the source PDFs, the folder that receives one
# sub-folder of page images per PDF, and the Poppler binaries directory.
pdf_dir = r"C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\raw\pdfs"
output_dir = r"C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images"
poppler_path = r"C:\Program Files\poppler\poppler-24.02.0\Library\bin"

# Convert every PDF in pdf_dir using the function's default DPI.
convert_pdfs_to_images(
    pdf_dir=pdf_dir,
    output_base_dir=output_dir,
    poppler_path=poppler_path,
)

Poppler path: C:\Program Files\poppler\poppler-24.02.0\Library\bin
PDF directory: C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\raw\pdfs
Output directory: C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images
Found 5 PDF files
Processing 2411.04106v1.pdf...
Saved C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images\2411.04106v1\page_1.png
Saved C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images\2411.04106v1\page_2.png
Saved C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images\2411.04106v1\page_3.png
Saved C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images\2411.04106v1\page_4.png
Saved C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images\2411.04106v1\page_5.png
Saved C:\Users\thinkpad\Documents\GitHub\Textra_AI_Research\data\processed\pdf_images\2411.04106v1\page_6.png
Saved C:\Users\thinkpad\Documents\GitHub\Te