In [38]:
import os
from PIL import Image, ImageStat
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

# Root logger setup: show INFO and above, each record prefixed with its
# timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [23]:
def is_image(file_path):
    """Report whether ``file_path`` opens and verifies as an image.

    Returns:
        True when ``Image.open`` succeeds and ``verify()`` raises nothing;
        False when either step raises ``IOError`` or ``SyntaxError``.
        Any other exception type is not handled here and reaches the caller.
    """
    try:
        with Image.open(file_path) as candidate:
            # A raise from verify() is what marks the file as "not an image".
            candidate.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True

In [24]:
def is_too_bright_or_too_dark(file_path, brightness_threshold=240, darkness_threshold=15):
    """Decide whether an image's mean grayscale value is outside the accepted range.

    The image is converted to grayscale (mode "L") and the mean of that single
    band is compared against both thresholds, inclusively.

    Args:
        file_path: Path of the image file to inspect.
        brightness_threshold: A mean at or above this value counts as too bright.
        darkness_threshold: A mean at or below this value counts as too dark.

    Returns:
        True if the mean is out of range, or if the image could not be opened
        or analysed at all; False otherwise.
    """
    try:
        with Image.open(file_path) as img:
            mean_brightness = ImageStat.Stat(img.convert("L")).mean[0]
        return mean_brightness >= brightness_threshold or mean_brightness <= darkness_threshold
    except Exception as e:
        # The verdict for unreadable files is unchanged (True), but the reason
        # is now recorded: the callers in this file delete on True, so a silent
        # failure here would remove files without leaving any explanation.
        logging.warning(f"Could not measure brightness of {file_path}; treating it as unusable: {e}")
        return True

In [25]:
def delete_file(file_path):
    """Remove ``file_path`` from disk; a failure is logged instead of raised."""
    try:
        os.remove(file_path)
    except Exception as err:
        message = f"Error deleting file {file_path}: {err}"
        logging.error(message)

In [26]:
def process_invalid_image(file_path):
    """Delete ``file_path`` unless ``is_image`` accepts it."""
    if is_image(file_path):
        return
    delete_file(file_path)

In [27]:
def process_bright_or_dark_image(file_path):
    """Delete ``file_path`` when ``is_too_bright_or_too_dark`` flags it."""
    if not is_too_bright_or_too_dark(file_path):
        return
    delete_file(file_path)

In [39]:
def delete_invalid_images(directory):
    """Run ``process_invalid_image`` over every regular file directly in ``directory``.

    Work is submitted to a pool of four threads. Entries that are not regular
    files (per ``os.path.isfile``) are skipped. An exception raised by a task
    is logged and does not stop the remaining tasks.
    """
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = [
            pool.submit(process_invalid_image, candidate)
            for candidate in (os.path.join(directory, entry) for entry in os.listdir(directory))
            if os.path.isfile(candidate)
        ]
        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as err:
                logging.error(f"Error processing file in delete_invalid_images: {err}")



In [40]:
def delete_too_bright_or_too_dark_images(directory):
    """Run the brightness filter over every image file directly in ``directory``.

    For each regular file, a task is submitted to a pool of four threads. The
    task first checks the file with ``is_image`` and, if it passes, hands it to
    ``process_bright_or_dark_image``. Entries that are not regular files are
    skipped.

    The ``is_image`` check runs inside the worker task rather than on the
    submitting thread: it opens and verifies each file, so doing it up front
    would make that step serial across the whole directory. As a consequence,
    an unexpected exception from ``is_image`` is logged for that file instead
    of aborting the whole pass.

    Args:
        directory: Path of the directory whose files are screened.
    """
    def _screen(file_path):
        # Executed on a pool thread; files that are not images are left alone.
        if is_image(file_path):
            process_bright_or_dark_image(file_path)

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(_screen, file_path)
            for file_path in (os.path.join(directory, name) for name in os.listdir(directory))
            if os.path.isfile(file_path)
        ]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                logging.error(f"Error processing file in delete_too_bright_or_too_dark_images: {e}")



In [41]:
def delete_half_images(directory):
    """Delete every second image file in ``directory``.

    Regular files directly inside ``directory`` are sorted lexicographically
    by path, filtered with ``is_image``, and the images at odd positions
    (2nd, 4th, ...) of that sorted list are removed with ``delete_file``.
    Sorting matters because ``os.listdir`` returns entries in arbitrary order;
    without it the set of deleted files would differ between runs and systems.

    Verification and deletion both run on a pool of four threads. An exception
    raised by a deletion task is logged. "Delete completed" is logged at INFO
    level once all tasks have finished.

    Args:
        directory: Path of the directory to thin out.
    """
    candidates = sorted(
        path
        for path in (os.path.join(directory, name) for name in os.listdir(directory))
        if os.path.isfile(path)
    )
    with ThreadPoolExecutor(max_workers=4) as executor:
        # map() yields results in the order of its inputs, so the verdicts
        # line up with the sorted candidate list.
        verdicts = list(executor.map(is_image, candidates))
        images = [path for path, ok in zip(candidates, verdicts) if ok]
        # Start from the second image (index 1) and take every other one.
        futures = [executor.submit(delete_file, path) for path in images[1::2]]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                logging.error(f"Error processing file in delete_half_images: {e}")
    logging.info("Delete completed")



In [42]:
def clean_directory(directory):
    """Apply the three clean-up passes to ``directory``, one after another.

    Order: invalid images, then too-bright/too-dark images, then every second
    remaining image.
    """
    passes = (
        delete_invalid_images,
        delete_too_bright_or_too_dark_images,
        delete_half_images,
    )
    for run_pass in passes:
        run_pass(directory)


In [43]:
# Replace these with the paths to your own directories; each one is cleaned in turn.
for directory in (
    r'A:\AI DB\LSPD\Videos\photos_from_videos\normal_photos',
    r'A:\AI DB\LSPD\Videos\photos_from_videos\porn_photos',
):
    clean_directory(directory)


2024-05-21 07:40:49,359 - INFO - Delete completed
2024-05-21 08:27:07,167 - INFO - Delete completed


In [44]:
import splitfolders as sf
from pathlib import Path

# Raw strings keep the Windows backslashes literal. Without the r-prefix,
# sequences such as "\A" and "\p" are invalid escape sequences, which newer
# Python versions warn about.
dataset_dir = Path(r"A:\AI DB\LSPD\Videos\photos_from_videos")

output_dir = Path(r"A:\AI DB\LSPD\Videos\photos_from_videos_pret")
output_dir.mkdir(exist_ok=True)
# split dataset 90% training 5% validation 5% testing
sf.ratio(dataset_dir, output_dir, seed=1337, ratio=(.9, .05, .05), group_prefix=None)

Copying files: 478753 files [25:09, 317.10 files/s]
