In [1]:
import os
from pdf2image import convert_from_path
import shutil

# Root directory that holds one subfolder per extracted report.
main_dir = 'WAMEX_DATA_EXTRACTED'

# A non-empty string is always truthy, so test for the directory on disk
# rather than testing the path string itself.
if os.path.isdir(main_dir):
    # Keep only the immediate child directories; loose files are ignored.
    subfolders = [d for d in os.listdir(main_dir) if os.path.isdir(os.path.join(main_dir, d))]
else:
    print("WAMEX_DATA_EXTRACTED NOT FOUND CHECK DIRECTORY")
    # Keep the name defined so later code that iterates over it does nothing
    # instead of failing with a NameError.
    subfolders = []

def convert_pdf_to_images(pdf_path, output_folder, subfolder_name, file_idx):
    """
    Render every page of a PDF at 300 dpi and save each page as a JPEG.

    Output files are named ``<subfolder_name>_<file_idx + 1>_<page>.jpg``,
    where ``<page>`` counts from 1.

    :param pdf_path: Path to the PDF file.
    :param output_folder: Folder where the images should be saved.
    :param subfolder_name: Name of the subfolder, used as the file-name prefix.
    :param file_idx: Zero-based index of the PDF file; written one-based.
    :return: None.
    """
    pages = convert_from_path(pdf_path, dpi=300)
    for i, page in enumerate(pages):
        # Both the file index and the page index are shifted to one-based.
        jpg_name = f"{subfolder_name}_{file_idx + 1}_{i + 1}.jpg"
        destination = os.path.join(output_folder, jpg_name)
        page.save(destination, 'JPEG')

def move_non_jpgs_to_original_folder(folder_path):
    """
    Move non-JPG files to a subfolder named 'Original_files'.

    Only regular entries directly inside ``folder_path`` are considered;
    subdirectories (including 'Original_files' itself) are left in place.
    The extension check is case-insensitive and accepts both ``.jpg`` and
    ``.jpeg``, so existing images named e.g. ``SCAN.JPG`` are not moved.

    :param folder_path: Path to the folder to check.
    :return: None.
    """
    original_files_folder = os.path.join(folder_path, 'Original_files')
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(original_files_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        source_path = os.path.join(folder_path, filename)
        if os.path.isdir(source_path):
            continue
        if filename.lower().endswith(('.jpg', '.jpeg')):
            continue
        shutil.move(source_path, os.path.join(original_files_folder, filename))

# PDFs larger than this many bytes (30MB) are not converted to images.
MAX_PDF_SIZE_BYTES = 30 * 1024 * 1024

for subfolder in subfolders:
    current_folder = os.path.join(main_dir, subfolder)
    # Match the extension case-insensitively so '.PDF' files are converted
    # too, and sort the names because os.listdir returns them in arbitrary
    # order, which would make the index in the image names vary between runs.
    pdf_files = sorted(f for f in os.listdir(current_folder) if f.lower().endswith('.pdf'))

    print(f"Processing folder: A {subfolder}")

    for idx, filename in enumerate(pdf_files):
        pdf_path = os.path.join(current_folder, filename)

        # Skip oversized PDFs; idx still advances, so a skipped file leaves a
        # gap in the numbering of the generated images.
        file_size = os.path.getsize(pdf_path)
        if file_size > MAX_PDF_SIZE_BYTES:
            print(f"Skipping file {filename} due to size being greater than 30MB.")
            continue

        convert_pdf_to_images(pdf_path, current_folder, subfolder, idx)

    # After converting to JPGs, move non-JPGs to 'Original_files'.
    # This includes any PDFs that were skipped above for being too large.
    move_non_jpgs_to_original_folder(current_folder)

print("Conversion and file movement complete!")


Processing folder: A 132302
Processing folder: A 132577
Processing folder: A 134934
Processing folder: A 135347
Conversion and file movement complete!
