In [1]:
import numpy as np
from PIL import Image
import os
from pathlib import Path

def _channel_histograms(pixels):
    """Count how often each value 0..255 occurs in each channel of an H x W x 3 uint8 array."""
    return np.stack([
        np.bincount(pixels[..., channel].ravel(), minlength=256)
        for channel in range(3)
    ]).astype(np.int64)


def _stats_from_histogram(histogram):
    """Return population mean/std of the values 0..255 weighted by a 256-bin count array."""
    count = int(histogram.sum())
    if count == 0:
        return {'mean': 0.0, 'std': 0.0}
    values = np.arange(256, dtype=np.float64)
    mean = float((histogram * values).sum() / count)
    # Two-pass form over the 256 bins avoids the cancellation of E[x^2] - E[x]^2.
    variance = float((histogram * (values - mean) ** 2).sum() / count)
    return {'mean': mean, 'std': float(np.sqrt(variance))}


def calculate_rgb_stats(parent_folder):
    """
    Calculate mean and standard deviation of RGB values across an image dataset
    organized in subdirectories.

    Pixels are accumulated into fixed-size per-channel histograms (3 x 256 counts)
    instead of being appended to Python lists, so memory use does not grow with
    the number of images. The statistics are the population mean/std over every
    pixel, exactly as np.mean/np.std over the concatenated pixel values would give.

    Parameters:
    parent_folder (str): Path to parent folder containing subdirectories with images.
        Files located directly in parent_folder are ignored; each subdirectory is
        searched recursively.

    Returns:
    dict: {'overall': {...}, 'per_folder': {folder_name: {...}}} where each inner
        dict maps 'red'/'green'/'blue' to {'mean': float, 'std': float}.
        Folders without any readable image are omitted from 'per_folder'; when no
        image is readable at all, the overall means and stds are 0.

    Raises:
    OSError: If parent_folder does not exist or cannot be listed.
    """
    # Supported image formats (compared against the lower-cased file suffix)
    valid_formats = {'.jpg', '.jpeg', '.png', '.bmp'}
    channel_names = ('red', 'green', 'blue')

    # Convert to Path object for easier handling
    parent_path = Path(parent_folder)

    overall_hist = np.zeros((3, 256), dtype=np.int64)
    folder_stats = {}

    # Sorted so that the order of 'per_folder' is reproducible across runs
    for folder in sorted(parent_path.iterdir()):
        if not folder.is_dir():
            continue

        folder_hist = np.zeros((3, 256), dtype=np.int64)

        # Process each image below the current folder (recursive)
        for img_path in folder.glob('**/*'):
            if img_path.suffix.lower() not in valid_formats or not img_path.is_file():
                continue
            try:
                # The context manager closes the underlying file handle promptly
                with Image.open(img_path) as img:
                    rgb = img if img.mode == 'RGB' else img.convert('RGB')
                    pixels = np.array(rgb)
                # Build the full histogram before touching the accumulators so a
                # failing image never contributes partial data.
                image_hist = _channel_histograms(pixels)
            except Exception as e:
                # Best effort: report and skip unreadable images. The exception type
                # is included because some exceptions (e.g. MemoryError) have an
                # empty message.
                print(f"Error processing {img_path}: {type(e).__name__}: {e}")
                continue
            folder_hist += image_hist

        # Only report folders that contained at least one readable image
        if folder_hist.any():
            folder_stats[folder.name] = {
                name: _stats_from_histogram(folder_hist[index])
                for index, name in enumerate(channel_names)
            }
            overall_hist += folder_hist

    overall_stats = {
        name: _stats_from_histogram(overall_hist[index])
        for index, name in enumerate(channel_names)
    }

    return {
        'overall': overall_stats,
        'per_folder': folder_stats
    }

def print_rgb_stats(stats):
    """
    Pretty-print the RGB statistics dictionary to stdout.

    Parameters:
    stats (dict): Mapping with an 'overall' entry and a 'per_folder' entry. Each
        group of statistics maps 'red', 'green' and 'blue' to a dict holding
        'mean' and 'std'; 'per_folder' maps a folder name to such a group.

    Output consists of an overall section followed by one section per folder,
    with every value formatted to two decimal places.
    """
    channel_order = ('red', 'green', 'blue')
    heavy_rule = "=" * 50
    light_rule = "-" * 40

    def _print_channels(lookup):
        # `lookup` is called lazily so the stats are only read after the
        # preceding header lines have been printed.
        for name in channel_order:
            print(f"{name.upper()}:")
            print(f"  Mean: {lookup()[name]['mean']:.2f}")
            print(f"  Std:  {lookup()[name]['std']:.2f}")

    print("\nOverall RGB Statistics:")
    print(heavy_rule)
    _print_channels(lambda: stats['overall'])

    print("\nPer-Folder RGB Statistics:")
    print(heavy_rule)
    for label, per_channel in stats['per_folder'].items():
        print(f"\n{label}:")
        print(light_rule)
        _print_channels(lambda: per_channel)



In [None]:
# Example usage: compute and print RGB statistics for the training split.
# NOTE(review): the path below is an absolute, machine-specific Windows location;
# adjust it before running this cell on another machine.
parent_folder = r"D:\Projects\SP CUP Dataset\train"
# Each subdirectory of parent_folder is reported as its own entry in the output.
stats = calculate_rgb_stats(parent_folder)
print_rgb_stats(stats)

In [None]:
# Example usage: compute and print RGB statistics for the validation split.
# NOTE(review): the path below is an absolute, machine-specific Windows location;
# adjust it before running this cell on another machine.
# This cell rebinds `parent_folder` and `stats`, so values assigned to those
# names by an earlier cell are replaced.
parent_folder = r"D:\Projects\SP CUP Dataset\valid"
stats = calculate_rgb_stats(parent_folder)
print_rgb_stats(stats)

Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0963609.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0964694.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0964880.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0968248.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0969810.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0970200.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0970516.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0970583.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0971594.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0971784.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0972229.png: 
Error processing D:\Projects\SP CUP Dataset\valid\fake\valid_fake_0972671.png: 
Error processing D:\Projects\SP CUP Data

In [None]:
def calculate_mean_std(dataset, batch_size=32, num_workers=4):
    """
    Calculate the per-channel mean and standard deviation over every pixel of a dataset.

    The statistics are accumulated as running sums (sum and sum of squares) in
    float64, so the result is the population mean/std over all pixels of all
    images. Averaging per-image standard deviations would ignore the variation
    between images and therefore underestimate the dataset's standard deviation.

    Parameters:
    dataset: Dataset whose items are (image, label) pairs; after batching, images
        must form a floating-point tensor whose first two dimensions are
        (batch, channels).
    batch_size (int): Number of images per batch. Defaults to 32.
    num_workers (int): Number of DataLoader worker processes. Defaults to 4;
        pass 0 to load in the main process.

    Returns:
    tuple: (mean, std), each a 1-D tensor with one entry per channel, in the
        dtype of the input images.

    Raises:
    ValueError: If the dataset yields no pixels (e.g. it is empty).
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    channel_sum = 0.0
    channel_sq_sum = 0.0
    pixels_per_channel = 0
    result_dtype = None

    for images, _ in loader:
        batch_samples = images.size(0)  # batch size (the last batch can have smaller size)
        result_dtype = images.dtype
        # Flatten the spatial dimensions: (batch, channels, pixels); float64 keeps
        # the running sums accurate on large datasets.
        flat = images.reshape(batch_samples, images.size(1), -1).double()
        channel_sum = channel_sum + flat.sum(dim=(0, 2))
        channel_sq_sum = channel_sq_sum + flat.pow(2).sum(dim=(0, 2))
        pixels_per_channel += batch_samples * flat.size(2)

    if pixels_per_channel == 0:
        raise ValueError("Cannot compute mean/std: the dataset yielded no pixels")

    mean = channel_sum / pixels_per_channel
    # Var[x] = E[x^2] - E[x]^2; clamp guards against tiny negative rounding errors.
    variance = (channel_sq_sum / pixels_per_channel - mean.pow(2)).clamp(min=0)
    std = variance.sqrt()

    return mean.to(result_dtype), std.to(result_dtype)

# Calculate and print the mean and std returned by calculate_mean_std.
# NOTE(review): `dataset` is not defined in this cell; it must already exist in the
# kernel and yield (images, label) pairs, since calculate_mean_std unpacks each
# batch that way.
mean, std = calculate_mean_std(dataset)
print(f"Mean: {mean}")
print(f"Std: {std}")