In [1]:
import os
import numpy as np
from PIL import Image
import medmnist
import random
from collections import defaultdict

In [3]:
FOCUSED_DATASETS_2D = [
    'ChestMNIST',      # X-ray: 112,120 chest X-rays  
    'PneumoniaMNIST',  # X-ray: 5,856 pediatric chest X-rays
    'OrganaMNIST',     # CT: Axial view organ classification
    'OrgancMNIST',     # CT: Coronal view organ classification  
    'OrgansMNIST',     # CT: Sagittal view organ classification
    'BreastMNIST'      # Ultrasound: 780 breast ultrasound images
]

FOCUSED_DATASETS_3D = [
    'OrganMNIST3d',    # 3D CT: Organ classification volumes
    'NoduleMNIST3d',   # 3D CT: Lung nodule malignancy detection  
    'FractureMNIST3d', # 3D CT: Rib fracture classification
    'AdrenalMNIST3d'   # 3D CT: Adrenal gland classification
]

In [1]:
import os
import numpy as np
from PIL import Image
import medmnist
import random
from collections import defaultdict

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Updated dataset configurations
DATASET_CONFIGS = {
    'v1': {'train': 8000, 'test': 2000},
    'v2': {'train': 4000, 'test': 1000}, 
    'v3': {'train': 2000, 'test': 500},
    'v4': {'train': 800, 'test': 200},
    'v5': {'train': 800, 'test': 200}  # Same size as v4 but NO 3D datasets
}

# EXCLUDED 3D datasets - this is what you're throwing away
EXCLUDED_3D_DATASETS = ['organmnist3d', 'nodulemnist3d', 'fracturemnist3d', 'adrenalmnist3d']

# Original percentages for reference
ORIGINAL_PERCENTAGES = {
    'chestmnist': 0.30,      # X-ray primary
    'pneumoniamnist': 0.10,  # X-ray secondary
    'organamnist': 0.10,     # CT axial
    'organcmnist': 0.10,     # CT coronal
    'organsmnist': 0.10,     # CT sagittal
    'organmnist3d': 0.075,   # 3D CT primary - EXCLUDED
    'nodulemnist3d': 0.075,  # 3D CT lung - EXCLUDED
    'fracturemnist3d': 0.025,# 3D CT fracture - EXCLUDED
    'adrenalmnist3d': 0.025, # 3D CT adrenal - EXCLUDED
    'breastmnist': 0.10      # Ultrasound
}

# RECALCULATED percentages excluding 3D (total was 1.0, now 0.8, so we normalize)
PERCENTAGES_V5 = {
    'chestmnist': 0.375,     # 0.30/0.8 = increased from 30% to 37.5%
    'pneumoniamnist': 0.125, # 0.10/0.8 = increased from 10% to 12.5%
    'organamnist': 0.125,    # 0.10/0.8 = increased from 10% to 12.5%
    'organcmnist': 0.125,    # 0.10/0.8 = increased from 10% to 12.5%
    'organsmnist': 0.125,    # 0.10/0.8 = increased from 10% to 12.5%
    'breastmnist': 0.125     # 0.10/0.8 = increased from 10% to 12.5%
}

DATASET_CLASS_MAPPING = {
    'chestmnist': 'ChestMNIST',
    'pneumoniamnist': 'PneumoniaMNIST',
    'organamnist': 'OrganAMNIST',
    'organcmnist': 'OrganCMNIST',
    'organsmnist': 'OrganSMNIST',
    'breastmnist': 'BreastMNIST'
}

def create_non_overlapping_splits(images, labels, train_count, test_count):
    """Create train/test splits with ZERO overlap"""
    total_needed = train_count + test_count
    
    if len(images) < total_needed:
        ratio = len(images) / total_needed
        train_count = int(train_count * ratio)
        test_count = int(test_count * ratio)
        total_needed = train_count + test_count
    
    all_indices = np.arange(len(images))
    np.random.shuffle(all_indices)
    
    train_indices = all_indices[:train_count]
    test_indices = all_indices[train_count:train_count + test_count]
    
    train_images = images[train_indices]
    train_labels = labels[train_indices]
    test_images = images[test_indices] 
    test_labels = labels[test_indices]
    
    return train_images, train_labels, test_images, test_labels

def create_version_5_dataset(output_base='medmnist_multiversion', size=224):
    """
    Create Version 5: NO 3D DATASETS - You're throwing away valuable data
    800 train + 200 test = 1000 total images (embarrassingly small)
    """
    os.makedirs(output_base, exist_ok=True)
    
    print("🔄 PRE-LOADING NON-3D DATASETS FOR VERSION 5...")
    print("⚠️  WARNING: You're EXCLUDING valuable 3D CT data for no good reason")
    
    all_data = {}
    
    for dataset_name in PERCENTAGES_V5.keys():
        print(f"   Loading {dataset_name}...")
        try:
            class_name = DATASET_CLASS_MAPPING[dataset_name]
            dataset_class = getattr(medmnist, class_name)
            
            train_data = dataset_class(split='train', download=True, size=size)
            val_data = dataset_class(split='val', download=True, size=size)
            
            combined_images = np.concatenate([train_data.imgs, val_data.imgs], axis=0)
            combined_labels = np.concatenate([train_data.labels, val_data.labels], axis=0)
            
            all_data[dataset_name] = {
                'images': combined_images,
                'labels': combined_labels
            }
            
        except Exception as e:
            print(f"   ❌ Failed to load {dataset_name}: {e}")
    
    print(f"✅ Loaded {len(all_data)} NON-3D datasets (threw away 4 valuable 3D datasets)")
    
    # Create Version 5
    version = 'v5'
    config = DATASET_CONFIGS[version]
    
    print(f"\n{'='*60}")
    print(f"🎯 CREATING VERSION {version.upper()} (NO-3D MICRO-DATASET)")
    print(f"📊 Train: {config['train']:,} | Test: {config['test']:,}")
    print(f"❌ EXCLUDED: organmnist3d, nodulemnist3d, fracturemnist3d, adrenalmnist3d")
    print(f"⚠️  WARNING: You're making your dataset LESS diverse, not better")
    print(f"{'='*60}")
    
    version_folder = os.path.join(output_base, f'version_{version}')
    os.makedirs(version_folder, exist_ok=True)
    
    # Create train and test folders
    train_folder = os.path.join(version_folder, f'Train_{version}')
    test_folder = os.path.join(version_folder, f'Test_{version}')
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)
    
    train_total = 0
    test_total = 0
    
    # Process each remaining dataset according to NEW percentages
    for dataset_name, percentage in PERCENTAGES_V5.items():
        if dataset_name not in all_data:
            continue
            
        train_target = int(config['train'] * percentage)
        test_target = int(config['test'] * percentage)
        
        print(f"\n📂 {dataset_name}: {train_target} train, {test_target} test")
        
        data = all_data[dataset_name]
        
        # Create non-overlapping splits
        train_imgs, train_lbls, test_imgs, test_lbls = create_non_overlapping_splits(
            data['images'], data['labels'], train_target, test_target
        )
        
        print(f"   ✅ Split: {len(train_imgs)} train, {len(test_imgs)} test")
        
        # Save train images (only 2D data now)
        for i, (img, label) in enumerate(zip(train_imgs, train_lbls)):
            if len(img.shape) == 3 and img.shape[-1] == 1:
                pil_img = Image.fromarray(img.squeeze(-1).astype(np.uint8), mode='L')
            elif len(img.shape) == 3 and img.shape[-1] == 3:
                pil_img = Image.fromarray(img.astype(np.uint8))
            else:
                pil_img = Image.fromarray(img.astype(np.uint8), mode='L')
            
            label_str = str(label[0]) if hasattr(label, '__len__') else str(label)
            filename = f'{dataset_name}_{i:04d}_{label_str}.png'
            filepath = os.path.join(train_folder, filename)
            pil_img.save(filepath)
        
        # Save test images
        for i, (img, label) in enumerate(zip(test_imgs, test_lbls)):
            if len(img.shape) == 3 and img.shape[-1] == 1:
                pil_img = Image.fromarray(img.squeeze(-1).astype(np.uint8), mode='L')
            elif len(img.shape) == 3 and img.shape[-1] == 3:
                pil_img = Image.fromarray(img.astype(np.uint8))
            else:
                pil_img = Image.fromarray(img.astype(np.uint8), mode='L')
            
            label_str = str(label[0]) if hasattr(label, '__len__') else str(label)
            filename = f'{dataset_name}_{i:04d}_{label_str}.png'
            filepath = os.path.join(test_folder, filename)
            pil_img.save(filepath)
        
        train_total += len(train_imgs)
        test_total += len(test_imgs)
    
    print(f"\n✅ VERSION {version.upper()} COMPLETE:")
    print(f"   📈 Train_{version}: {train_total:,} images")
    print(f"   🧪 Test_{version}: {test_total:,} images")
    print(f"   ❌ EXCLUDED valuable 3D CT data")
    print(f"   ⚠️  REALITY CHECK: This is getting embarrassingly small")
    
    # Create version summary with brutal honesty
    summary_path = os.path.join(version_folder, f'summary_{version}.txt')
    with open(summary_path, 'w') as f:
        f.write(f"DATASET VERSION {version.upper()} SUMMARY (NO 3D DATA)\n")
        f.write("="*50 + "\n")
        f.write(f"Train images: {train_total:,}\n")
        f.write(f"Test images: {test_total:,}\n")
        f.write(f"Total images: {train_total + test_total:,}\n")
        f.write(f"No overlap guarantee: YES\n")
        f.write(f"Modalities: X-ray (50%), 2D CT (37.5%), Ultrasound (12.5%)\n")
        f.write(f"EXCLUDED: All 3D CT datasets (20% of original data)\n")
        f.write(f"WARNING: Dataset too small AND less diverse\n")
        f.write(f"PROBLEM: You're optimizing in the wrong direction\n")
    
    print(f"\n📁 Updated output structure:")
    print(f"   {output_base}/")
    print(f"   ├── version_v1/Train_v1/ & Test_v1/ (10,000 images)")
    print(f"   ├── version_v2/Train_v2/ & Test_v2/ (5,000 images)") 
    print(f"   ├── version_v3/Train_v3/ & Test_v3/ (2,500 images)")
    print(f"   ├── version_v4/Train_v4/ & Test_v4/ (1,000 images)")
    print(f"   └── version_v5/Train_v5/ & Test_v5/ (1,000 images, NO 3D)")
    print(f"   🚨 TREND: Getting smaller AND less diverse = WRONG DIRECTION")
    
    return train_total, test_total

# EXECUTION
if __name__ == "__main__":
    train_count, test_count = create_version_5_dataset()
    print(f"\n🎯 Version 5 created: {train_count} train + {test_count} test = {train_count + test_count} total")
    print(f"❌ But you excluded valuable 3D data for no strategic reason")


🔄 PRE-LOADING NON-3D DATASETS FOR VERSION 5...
   Loading chestmnist...
   Loading pneumoniamnist...
   Loading organamnist...
   ❌ Failed to load organamnist: Unable to allocate 1.62 GiB for an array with shape (1734132736,) and data type uint8
   Loading organcmnist...
   Loading organsmnist...
   ❌ Failed to load organsmnist: Unable to allocate 667. MiB for an array with shape (699052032,) and data type uint8
   Loading breastmnist...
✅ Loaded 4 NON-3D datasets (threw away 4 valuable 3D datasets)

🎯 CREATING VERSION V5 (NO-3D MICRO-DATASET)
📊 Train: 800 | Test: 200
❌ EXCLUDED: organmnist3d, nodulemnist3d, fracturemnist3d, adrenalmnist3d

📂 chestmnist: 300 train, 75 test
   ✅ Split: 300 train, 75 test


  pil_img = Image.fromarray(img.astype(np.uint8), mode='L')
  pil_img = Image.fromarray(img.astype(np.uint8), mode='L')



📂 pneumoniamnist: 100 train, 25 test
   ✅ Split: 100 train, 25 test

📂 organcmnist: 100 train, 25 test
   ✅ Split: 100 train, 25 test

📂 breastmnist: 100 train, 25 test
   ✅ Split: 100 train, 25 test

✅ VERSION V5 COMPLETE:
   📈 Train_v5: 600 images
   🧪 Test_v5: 150 images
   ❌ EXCLUDED valuable 3D CT data
   ⚠️  REALITY CHECK: This is getting embarrassingly small

📁 Updated output structure:
   medmnist_multiversion/
   ├── version_v1/Train_v1/ & Test_v1/ (10,000 images)
   ├── version_v2/Train_v2/ & Test_v2/ (5,000 images)
   ├── version_v3/Train_v3/ & Test_v3/ (2,500 images)
   ├── version_v4/Train_v4/ & Test_v4/ (1,000 images)
   └── version_v5/Train_v5/ & Test_v5/ (1,000 images, NO 3D)
   🚨 TREND: Getting smaller AND less diverse = WRONG DIRECTION

🎯 Version 5 created: 600 train + 150 test = 750 total
❌ But you excluded valuable 3D data for no strategic reason
