In [1]:
import os

# Root directory of the raw dataset; passed to count_images() further down.
raw_data_path = "../datas/raw"

def count_images(path):
    """Recursively count image files under ``path``.

    The tree is traversed with ``os.walk``; a file counts as an image when its
    name ends (case-insensitively) in ``.png``, ``.jpg`` or ``.jpeg``.

    Args:
        path: Root directory to scan. ``os.walk`` silently yields nothing for
            a path that does not exist, so the result is ``(0, {})`` then.

    Returns:
        A ``(total, stats)`` tuple. ``total`` is the number of image files
        found. ``stats`` maps the base name of each directory that directly
        contains images (e.g. 'Opened', 'Closed') to its image count; counts
        of directories sharing a base name are summed, so
        ``sum(stats.values()) == total`` always holds.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    total = 0
    # Images per category, keyed by the containing directory's base name
    stats = {}

    for root, _dirs, files in os.walk(path):
        n_imgs = sum(1 for f in files if f.lower().endswith(image_exts))
        if not n_imgs:
            continue

        # normpath drops a trailing separator so that "raw/Opened/" is
        # reported as 'Opened' instead of the empty string.
        category = os.path.basename(os.path.normpath(root))
        # Accumulate instead of assigning: two directories with the same base
        # name (train/Opened, val/Opened) must not overwrite each other.
        stats[category] = stats.get(category, 0) + n_imgs
        total += n_imgs

    return total, stats

# Scan the raw dataset once, then report the grand total followed by the
# per-folder breakdown.
scan_result = count_images(raw_data_path)
total_count, category_stats = scan_result

print(f"Total Images Found: {total_count}")
print("Breakdown per category:", category_stats)

Total Images Found: 5936
Breakdown per category: {'Opened': 2866, 'Closed': 3070}


In [2]:
import os
import shutil
import random

# Paths
base_path = "../datas/raw"
# Maps each source folder to the folder that receives its sampled subset
categories = {
    'Opened': 'opened_1',
    'Closed': 'closed_1'
}

# Number of samples per category
sample_size = 2000

# Fixed seed + dedicated generator: Restart & Run All then always selects the
# same files, so re-running this cell overwrites the same copies instead of
# piling additional random images into the destination folders.
seed = 42
rng = random.Random(seed)

image_exts = ('.png', '.jpg', '.jpeg')

for original_folder, new_folder in categories.items():
    src_dir = os.path.join(base_path, original_folder)
    dst_dir = os.path.join(base_path, new_folder)

    # Get all image files. os.listdir returns entries in arbitrary order, so
    # sort them to make the seeded selection independent of the filesystem.
    all_imgs = sorted(f for f in os.listdir(src_dir) if f.lower().endswith(image_exts))

    # Fail early, before anything is created, with a message that names the
    # folder (random.sample would raise a less helpful ValueError here).
    if len(all_imgs) < sample_size:
        raise ValueError(
            f"Cannot sample {sample_size} images from '{src_dir}': "
            f"only {len(all_imgs)} available."
        )

    # Create the new directory if it doesn't exist
    os.makedirs(dst_dir, exist_ok=True)

    # Select sample_size random images (without replacement)
    subset_imgs = rng.sample(all_imgs, sample_size)

    # Copy images to the new folder
    for img_name in subset_imgs:
        shutil.copy(os.path.join(src_dir, img_name), os.path.join(dst_dir, img_name))

    # Images left behind by an earlier run with a different selection would
    # silently inflate the subset; report them rather than deleting anything.
    selected = set(subset_imgs)
    stale_imgs = [
        f for f in os.listdir(dst_dir)
        if f.lower().endswith(image_exts) and f not in selected
    ]
    if stale_imgs:
        print(f"Warning: '{dst_dir}' contains {len(stale_imgs)} image(s) that are not part of this sample.")

print(f"Done! Created {sample_size} images in both 'opened_1' and 'closed_1'.")

Done! Created 2000 images in both 'opened_1' and 'closed_1'.


In [3]:
import os

base_path = "../datas/raw"
subset_folders = ['opened_1', 'closed_1']

print("--- Subset Verification ---")
for folder in subset_folders:
    folder_path = os.path.join(base_path, folder)

    # Guard clause: report a missing folder and move on to the next one
    if not os.path.exists(folder_path):
        print(f"Folder: {folder:10} | Error: Folder not found!")
        continue

    # Collect the entries whose extension marks them as images
    files = []
    for entry in os.listdir(folder_path):
        if entry.lower().endswith(('.png', '.jpg', '.jpeg')):
            files.append(entry)

    print(f"Folder: {folder:10} | Images found: {len(files)}")

--- Subset Verification ---
Folder: opened_1   | Images found: 2000
Folder: closed_1   | Images found: 2000


In [4]:
from PIL import Image
import os

# Directory that holds the subset folders; verify_images() reads this
# module-level name when building each folder path.
base_path = "../datas/raw"
# Folder names (relative to base_path) handed to verify_images() below.
subsets = ['opened_1', 'closed_1']

def verify_images(folder_list):
    """Run Pillow's integrity check over the images in each listed folder.

    Each folder name is joined onto the module-level ``base_path``. Every
    entry whose name ends (case-insensitively) in ``.png``, ``.jpg`` or
    ``.jpeg`` is opened and passed through ``Image.verify()``; files that
    raise ``OSError`` or ``SyntaxError`` are reported on stdout and collected.

    Args:
        folder_list: Iterable of folder names located under ``base_path``.

    Returns:
        A ``(corrupt_files, total_scanned)`` tuple: the paths that failed
        the check, and the number of image files that were examined.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    failed_paths = []
    checked = 0

    for folder in folder_list:
        folder_path = os.path.join(base_path, folder)
        print(f"Scanning {folder}...")

        for entry in os.listdir(folder_path):
            # Skip anything that does not look like an image
            if not entry.lower().endswith(image_exts):
                continue

            file_path = os.path.join(folder_path, entry)
            checked += 1
            try:
                with Image.open(file_path) as img:
                    img.verify()  # raises if the file is found to be broken
            except (OSError, SyntaxError) as e:  # IOError is an alias of OSError
                print(f"Bad file detected: {file_path} --> {e}")
                failed_paths.append(file_path)

    return failed_paths, checked

bad_images, total = verify_images(subsets)

# Emit the three summary lines through a single print call
print(
    "\n--- Scan Complete ---",
    f"Total images scanned: {total}",
    f"Total corrupt images: {len(bad_images)}",
    sep="\n",
)

# Optional cleanup (disabled): uncomment the loop below to delete every file
# that the scan flagged.
# for path in bad_images:
#     os.remove(path)
#     print(f"Removed: {path}")

Scanning opened_1...
Scanning closed_1...

--- Scan Complete ---
Total images scanned: 4000
Total corrupt images: 0
