In [None]:
import numpy as np
from pathlib import Path
from PIL import Image
import pandas as pd

In [None]:
# Root folders of the mask datasets to analyse; each root is expected to hold
# one subfolder per case containing PNG masks.
# NOTE: these are placeholders. pathlib turns an empty string into the current
# directory (Path('') == Path('.')), so replace them with real paths.
mask_paths = [
    '',
    '',
]

# RGB colour used for each class in nuclei segmentation masks.
NUCLEI_COLOR_MAP = {
    'Background': (255, 255, 255),    # White
    'Negative': (112, 112, 225),      # Blue
    'Positive': (250, 62, 62),        # Red
    'Boundary': (0, 0, 0),            # Black
}

# RGB colour used for each class in area (tissue region) segmentation masks.
AREA_COLOR_MAP = {
    'Background': (0, 0, 0),          # Black
    'Cancer': (245, 66, 66),          # Red
    'Other Tissue': (66, 135, 245),   # Blue
}

def detect_dataset_type(mask_path):
    """Guess the dataset kind ('nuclei' or 'area') from the text of a path.

    The match is case-insensitive. 'nuclei' takes precedence when both words
    occur in the path, and it is also the fallback when neither occurs.
    """
    normalized = str(mask_path).lower()
    # 'area' is reported only when the path mentions it and not 'nuclei'.
    if 'area' in normalized and 'nuclei' not in normalized:
        return 'area'
    return 'nuclei'

In [None]:
def analyze_mask_statistics(mask_folder_path, dataset_type='area'):
    """Compute per-subfolder class pixel statistics for a folder of RGB masks.

    Every immediate subfolder of ``mask_folder_path`` is scanned for ``*.png``
    files. Each mask is converted to RGB and the pixels that exactly match a
    class colour are counted.

    Args:
        mask_folder_path: Directory whose subfolders contain the PNG masks.
        dataset_type: ``'area'`` selects ``AREA_COLOR_MAP``; any other value
            selects ``NUCLEI_COLOR_MAP``.

    Returns:
        A ``pandas.DataFrame`` with one row per subfolder that contains at
        least one classified pixel. Columns are ``Subfolder``, ``Images``,
        ``Total Pixels`` plus ``'<class> (%)'`` and ``'<class>_pixels'`` for
        every class. Returns ``None`` (after printing a message) when the path
        does not exist, is not a directory, or has no subfolders.

    Note:
        ``Total Pixels`` is the sum of the per-class counts, so pixels whose
        colour matches no class are not part of the total or the percentages.
    """
    mask_path = Path(mask_folder_path)

    if not mask_path.exists():
        print(f"Error: Path '{mask_path}' does not exist.")
        return None

    # iterdir() raises NotADirectoryError on a regular file; report it instead.
    if not mask_path.is_dir():
        print(f"Error: Path '{mask_path}' is not a directory.")
        return None

    color_map = AREA_COLOR_MAP if dataset_type == 'area' else NUCLEI_COLOR_MAP
    # Class order follows the colour map, so the two can never drift apart.
    class_names = list(color_map)

    subfolders = sorted(d for d in mask_path.iterdir() if d.is_dir())

    if not subfolders:
        print("No subfolders found in mask directory.")
        return None

    results = []

    for subfolder in subfolders:
        mask_files = list(subfolder.glob('*.png'))

        if not mask_files:
            continue

        class_pixels = {name: 0 for name in class_names}

        for mask_file in mask_files:
            # The context manager releases the file handle right away; without
            # it a large dataset can exhaust the open-file limit.
            with Image.open(mask_file) as img:
                img_array = np.array(img.convert('RGB'))

            for class_name, color in color_map.items():
                # A pixel belongs to the class when all three channels match.
                matches = np.all(img_array == color, axis=-1)
                class_pixels[class_name] += int(np.count_nonzero(matches))

        total_pixels = sum(class_pixels.values())

        if total_pixels > 0:
            result = {
                'Subfolder': subfolder.name,
                'Images': len(mask_files),
                'Total Pixels': total_pixels
            }

            for class_name in class_names:
                result[f'{class_name} (%)'] = (class_pixels[class_name] / total_pixels) * 100
                result[f'{class_name}_pixels'] = class_pixels[class_name]

            results.append(result)

    return pd.DataFrame(results)


def analyze_multiple_mask_folders(mask_paths_list):
    """Run the mask analysis on several root folders and combine the results.

    For each path the dataset type is detected from the path text, the
    per-subfolder statistics are computed, and ``Source`` (final path
    component) and ``Type`` columns are added before concatenation.

    Args:
        mask_paths_list: Iterable of mask root folders. ``None`` is treated as
            an empty list, and blank entries are skipped.

    Returns:
        The concatenated ``pandas.DataFrame`` of all folders that produced
        data, or ``None`` when no folder produced any.

    Note:
        ``Source`` is only the last path component, so two roots that share
        the same folder name receive the same label.
    """
    all_results = []

    for mask_path in (mask_paths_list or []):
        # A blank string becomes Path('.') and would silently analyse the
        # current working directory, so placeholder entries are skipped.
        if not str(mask_path).strip():
            print("Skipping empty mask path entry")
            continue

        print(f"Analyzing: {mask_path}")

        dataset_type = detect_dataset_type(mask_path)
        print(f"Dataset type: {dataset_type}")

        df = analyze_mask_statistics(mask_path, dataset_type=dataset_type)

        if df is not None and not df.empty:
            df['Source'] = Path(mask_path).name
            df['Type'] = dataset_type
            all_results.append(df)
            print(f" Found {len(df)} subfolders with {df['Images'].sum()} total images")
        else:
            print(f" No data found in {mask_path}")

    if not all_results:
        print("\n No data found in any of the provided paths")
        return None

    combined_df = pd.concat(all_results, ignore_index=True)

    print(f"Combined analysis: {len(all_results)} folders analyzed\n")

    print(f"Total subfolders: {len(combined_df)}")
    print(f"Total images: {combined_df['Images'].sum()}")
    print(f"Total pixels: {combined_df['Total Pixels'].sum():,}")

    return combined_df

# Analyse every configured mask folder; the result is None when nothing was found.
df_stats = analyze_multiple_mask_folders(mask_paths)

# Show the per-subfolder percentage table only when there is data to show.
if not (df_stats is None or df_stats.empty):
    print("Class Distribution Statistics:\n")
    display_cols = ['Source', 'Type', 'Subfolder', 'Images', 'Total Pixels']
    pct_cols = [name for name in df_stats.columns if '(%)' in name]
    print(df_stats.loc[:, display_cols + pct_cols].to_string(index=False))

In [None]:
def _present_columns(frame, columns):
    """Return the entries of `columns` holding at least one non-missing value in `frame`."""
    return [col for col in columns if frame[col].notna().any()]


def _print_class_shares(frame, total_pixels):
    """Print each class's share of `total_pixels`, summed over the rows of `frame`."""
    pixel_cols = [col for col in frame.columns if col.endswith('_pixels')]
    # Combining area and nuclei results leaves all-NaN columns for the classes
    # of the other dataset type; skipping them avoids bogus "0.00%" lines.
    for pixel_col in _present_columns(frame, pixel_cols):
        class_name = pixel_col.replace('_pixels', '')
        total_class_pixels = frame[pixel_col].sum()
        pct = (total_class_pixels / total_pixels) * 100 if total_pixels > 0 else 0
        print(f"   {class_name:20s}: {pct:.2f}%")


if df_stats is not None and not df_stats.empty:
    # Section 1: one entry per subfolder, grouped by source folder.
    print("Detailed statistics by source and subfolder:\n")

    for source in df_stats['Source'].unique():
        source_df = df_stats[df_stats['Source'] == source]
        dataset_type = source_df['Type'].iloc[0]

        print(f"\n Source: {source} (Type: {dataset_type})")
        print("-" * 80)

        pct_cols = _present_columns(
            source_df, [col for col in source_df.columns if '(%)' in col]
        )

        for _, row in source_df.iterrows():
            print(f"   {row['Subfolder']}")
            print(f"     Images: {row['Images']}")

            for pct_col in pct_cols:
                # A class that does not apply to this row would print "nan%".
                if pd.isna(row[pct_col]):
                    continue
                class_name = pct_col.replace(' (%)', '')
                print(f"     {class_name:20s}: {row[pct_col]:.2f}%")

            print(f"     Total Pixels:        {row['Total Pixels']:,}")
            print()

    # Section 2: class shares pooled over every source of the same type.
    print("Overall statistics by dataset type:\n")

    for dataset_type in df_stats['Type'].unique():
        type_df = df_stats[df_stats['Type'] == dataset_type]

        total_images = type_df['Images'].sum()
        total_pixels = type_df['Total Pixels'].sum()

        print(f"\n {dataset_type.upper()} Dataset:")
        print(f"   Subfolders: {len(type_df)}")
        print(f"   Total Images: {total_images}")
        print(f"   Total Pixels: {total_pixels:,}")

        _print_class_shares(type_df, total_pixels)

    # Section 3: grand totals, followed by pooled class shares per source.
    print("Overall statistics (all datasets combined):\n")

    print(f"Total Sources: {df_stats['Source'].nunique()}")
    print(f"Total Dataset Types: {df_stats['Type'].nunique()}")
    print(f"Total Subfolders: {len(df_stats)}")
    print(f"Total Images: {df_stats['Images'].sum()}")
    print(f"Total Pixels: {df_stats['Total Pixels'].sum():,}")

    print("Statistics by source:")
    for source in df_stats['Source'].unique():
        source_df = df_stats[df_stats['Source'] == source]
        source_total_pixels = source_df['Total Pixels'].sum()
        dataset_type = source_df['Type'].iloc[0]

        print(f"\n {source} ({dataset_type})")
        print(f"   Subfolders: {len(source_df)}")
        print(f"   Images: {source_df['Images'].sum()}")
        print(f"   Pixels: {source_total_pixels:,}")

        _print_class_shares(source_df, source_total_pixels)
else:
    print("No statistics available to display.")

In [None]:
def analyze_by_dataset_split(df_stats, train_subfolders=None, val_subfolders=None):
    """Assign subfolders to train/validation splits and summarise class balance.

    Args:
        df_stats: Per-subfolder statistics with at least the columns
            ``Subfolder``, ``Images`` and ``Total Pixels``, plus any
            ``'<class> (%)'`` and ``'<class>_pixels'`` columns.
        train_subfolders: Subfolder names for the training split. A single
            name may be given as a string.
        val_subfolders: Subfolder names for the validation split. A single
            name may be given as a string.

        When only one of the two is provided, every remaining subfolder goes
        to the other split. When both are provided, subfolders named in
        neither are labelled ``'Unassigned'`` and left out of both summaries.

    Returns:
        ``(assigned_df, summary_df)``. ``assigned_df`` holds the descriptive
        and percentage columns plus ``'Assigned Set'``, with a fresh index.
        ``summary_df`` has one row for ``Train`` and one for ``Validation``
        with image and pixel totals and the pooled percentage of each class.

    Raises:
        ValueError: If ``df_stats`` is ``None`` or empty, if neither list is
            given, if a listed name is not a known subfolder, or if a name
            appears in both lists.
    """
    if df_stats is None or df_stats.empty:
        raise ValueError("df_stats is empty or None")

    if train_subfolders is None and val_subfolders is None:
        raise ValueError("At least one of train_subfolders or val_subfolders must be provided")

    def to_name_set(names):
        # set('case1') would yield single characters, so wrap bare strings.
        if names is None:
            return set()
        if isinstance(names, str):
            return {names}
        return set(names)

    pixel_cols = [col for col in df_stats.columns if col.endswith('_pixels')]
    pct_cols = [col for col in df_stats.columns if '(%)' in col]

    df = df_stats.copy()
    all_subfolders = set(df['Subfolder'].values)

    train_set = to_name_set(train_subfolders)
    val_set = to_name_set(val_subfolders)

    invalid = train_set - all_subfolders
    if invalid:
        raise ValueError(f"Train subfolders not found in dataset: {invalid}")

    invalid = val_set - all_subfolders
    if invalid:
        raise ValueError(f"Validation subfolders not found in dataset: {invalid}")

    overlap = train_set & val_set
    if overlap:
        raise ValueError(f"Subfolders appear in both train and validation: {overlap}")

    # With only one side specified, the other side receives the complement.
    if train_subfolders is not None and val_subfolders is None:
        val_set = all_subfolders - train_set
    elif val_subfolders is not None and train_subfolders is None:
        train_set = all_subfolders - val_set

    def label_for(name):
        # Names in neither set must not silently count as validation data.
        if name in train_set:
            return 'Train'
        if name in val_set:
            return 'Validation'
        return 'Unassigned'

    assignment = [label_for(name) for name in df['Subfolder']]

    base_cols = ['Subfolder', 'Images', 'Total Pixels']
    if 'Source' in df.columns:
        base_cols.insert(0, 'Source')
    if 'Type' in df.columns:
        base_cols.insert(1, 'Type')

    df_result = df[base_cols + pct_cols].copy()
    df_result['Assigned Set'] = assignment

    def aggregate_stats(split_name):
        # Positional selection stays correct even if the index has duplicates.
        positions = [i for i, label in enumerate(assignment) if label == split_name]
        rows = df.iloc[positions]

        if rows.empty:
            return {'Split': split_name, 'Total Images': 0, 'Total Pixels': 0}

        total_pixels = rows['Total Pixels'].sum()

        result = {
            'Split': split_name,
            'Total Images': int(rows['Images'].sum()),
            'Total Pixels': int(total_pixels)
        }

        # Pool raw pixel counts so large subfolders weigh more than small ones.
        for pixel_col in pixel_cols:
            class_name = pixel_col[:-len('_pixels')]
            total_class_pixels = rows[pixel_col].sum()
            pct = (total_class_pixels / total_pixels) * 100 if total_pixels > 0 else 0.0
            result[f'{class_name} (%)'] = pct

        return result

    def print_split(header, stats):
        print(f"{header}: {stats['Total Images']} images, {stats['Total Pixels']:,} pixels")
        for key, val in stats.items():
            if key.endswith('(%)'):
                print(f"    {key}: {val:.2f}%", end='')
        print()

    train_stats = aggregate_stats('Train')
    val_stats = aggregate_stats('Validation')

    print("\nDataset split summary:")
    print_split("  Train", train_stats)
    print_split("Validation", val_stats)

    unassigned = sorted(all_subfolders - train_set - val_set, key=str)
    if unassigned:
        print(f"Unassigned subfolders (excluded from both splits): {unassigned}")

    summary_df = pd.DataFrame([train_stats, val_stats])

    return df_result.reset_index(drop=True), summary_df


In [None]:
# List the case subfolders found under every configured mask root.
print("Available subfolders by source:\n")
for mask_path in mask_paths:
    path = Path(mask_path)

    # Report missing roots and move on to the next one.
    if not path.exists():
        print(f"Path does not exist: {mask_path}\n")
        continue

    print(f"{path.name}:")
    subdirs = sorted(entry for entry in path.iterdir() if entry.is_dir())
    for folder in subdirs:
        print(f"   - {folder.name}")
    print()

In [None]:
# Fill in the subfolder names that belong to each split.
train_cases = []
val_cases = []

# analyze_by_dataset_split raises ValueError on missing statistics, so guard
# the call the same way the other reporting cells do.
if df_stats is not None and not df_stats.empty:
    assigned_df, df_split_stats = analyze_by_dataset_split(
        df_stats,
        train_subfolders=train_cases,
        val_subfolders=val_cases,
    )

    print("\nDetailed assignment:")
    print(assigned_df.to_string(index=False))

    print("Split statistics:")
    print(df_split_stats.to_string(index=False))
else:
    print("No statistics available to display.")