# In [1]:
import os
import shutil
import pytesseract
from PIL import Image

# Location of the Tesseract executable that pytesseract is told to use;
# change this path if Tesseract is installed somewhere else.
_TESSERACT_EXECUTABLE = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = _TESSERACT_EXECUTABLE

def filter_images_with_text(source_directory, destination_base_directory, images_per_set=1000, total_images_needed=4000):
    """Copy images in which OCR finds no text into numbered ``set_N`` folders.

    Every regular file directly inside ``source_directory`` is opened with
    PIL and passed to ``pytesseract.image_to_string``.  Files whose OCR
    output is empty or whitespace-only are copied (the originals are left
    in place) into ``destination_base_directory/set_1``, ``set_2``, ...,
    with at most ``images_per_set`` files per folder.  Processing stops once
    ``total_images_needed`` files have been copied or the source files run
    out.

    Args:
        source_directory: Folder whose files are scanned (not recursive).
        destination_base_directory: Folder under which the ``set_N``
            subfolders are created; it is created if missing.
        images_per_set: Maximum number of images stored in one ``set_N``
            folder.  Must be positive.
        total_images_needed: Stop after this many images have been copied.

    Returns:
        The number of images that were copied.

    Raises:
        ValueError: If ``images_per_set`` is not positive.
    """
    if images_per_set <= 0:
        raise ValueError("images_per_set must be a positive integer")

    os.makedirs(destination_base_directory, exist_ok=True)

    # Round up so a final, partially filled set also gets a folder.  Floor
    # division here would leave no destination for the last images whenever
    # total_images_needed is not a multiple of images_per_set.
    num_sets = max(0, -(-total_images_needed // images_per_set))
    destination_directories = [
        os.path.join(destination_base_directory, f'set_{i+1}')
        for i in range(num_sets)
    ]
    for dest_dir in destination_directories:
        os.makedirs(dest_dir, exist_ok=True)

    # Only regular files are considered; subdirectories are skipped.
    files = [
        f for f in os.listdir(source_directory)
        if os.path.isfile(os.path.join(source_directory, f))
    ]

    count = 0
    for file in files:
        if count >= total_images_needed:
            break

        file_path = os.path.join(source_directory, file)
        try:
            # The context manager releases the file handle as soon as OCR
            # is done, instead of keeping one open per processed image.
            with Image.open(file_path) as img:
                text = pytesseract.image_to_string(img)

            if text.strip():
                continue  # OCR found text: leave this image out

            # The running count decides the target folder: images
            # 0..images_per_set-1 go to set_1, the next batch to set_2, ...
            destination_dir = destination_directories[count // images_per_set]
            shutil.copy(file_path, os.path.join(destination_dir, file))
            count += 1
        except Exception as e:
            # Best effort: one unreadable or non-image file must not abort
            # the whole run, so report it and continue with the next file.
            print(f"Error processing {file_path}: {e}")

    return count

# Folder holding the images to scan, and the base folder that receives the
# selected copies.
source_directory_path = r"C:/Users/meena/Downloads/LLD-logo_files (1)/LLD-logo-files"
destination_directory_path = r"C:/Users/meena/Downloads/cleaned_LLD_dataset"

# Run the filter with the function's default set size and image quota.
filter_images_with_text(
    source_directory=source_directory_path,
    destination_base_directory=destination_directory_path,
)