# 🔧 FIXED VERSION - Memory-Optimized Training

## Key Fixes Applied:
1. ✅ Reduced batch size from 24 to 4 (prevents OOM)
2. ✅ Enabled gradient checkpointing (saves memory)
3. ✅ Added memory cleanup between folds
4. ✅ Reduced patch size for less memory usage
5. ✅ Added CUDA memory monitoring

In [None]:
# Install PyTorch wheels built for CUDA 11.8 (change the index URL to match your GPU/driver)
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install MONAI pinned to 1.3.0, including its optional "all" extras
!pip install "monai[all]==1.3.0"

# Install nnU-Net v2 (the training cell later invokes its `nnUNetv2_train` command)
!pip install nnunetv2

# Install general-purpose image-processing, ML, data, plotting, and progress-bar packages
!pip install opencv-python scikit-learn pandas matplotlib seaborn tqdm
# Install medical-image I/O packages plus the albumentations augmentation library
!pip install SimpleITK nibabel pydicom albumentations

In [None]:
import os
import gc
import torch
import numpy as np
import subprocess
from pathlib import Path

# Export allocator/debug settings in one step. They live in os.environ, so
# any child process started with a copy of the environment inherits them.
os.environ.update({
    # Caching-allocator option: cap the size of blocks that may be split
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:128',
    # Intended to give clearer CUDA error messages
    'CUDA_LAUNCH_BLOCKING': '1',
})

print("‚úì Memory optimization enabled")

In [None]:
# CRITICAL FIX: Memory-optimized configuration
class MemoryOptimizedConfig:
    """Conservative training settings chosen to keep GPU memory use low.

    Every value is a plain class-level constant. In the visible part of this
    notebook only ``BATCH_SIZE`` is consumed (it is written into the nnU-Net
    plans file); the other fields are declared here and take effect only if
    the code that reads this config applies them.
    """

    # --- Batch / patch geometry ---
    BATCH_SIZE: int = 4  # lowered from 24, the value blamed for the OOM
    PATCH_SIZE: list = [128, 128, 128]  # lowered from [160, 160, 160]

    # --- Training schedule ---
    MAX_EPOCHS: int = 500  # lowered from 1000 for faster iteration
    LEARNING_RATE: float = 1e-3

    # --- Memory-saving switches ---
    USE_GRADIENT_CHECKPOINTING: bool = True
    MIXED_PRECISION: bool = True  # automatic mixed precision (AMP)

    # --- Validation ---
    VAL_INTERVAL: int = 10  # presumably epochs between validations - TODO confirm
    
# Instantiate the shared config object (read again by the plans-patching cell)
# and echo the batch and patch sizes so the active settings are visible in the log.
config = MemoryOptimizedConfig()
print(f"‚úì Config set: Batch={config.BATCH_SIZE}, Patch={config.PATCH_SIZE}")

In [None]:
def clear_cuda_memory():
    """Release cached GPU memory and print current CUDA memory usage.

    Runs the Python garbage collector first so unreachable objects are
    freed. Then, only when CUDA is available, empties PyTorch's CUDA cache,
    waits for outstanding GPU work, and prints the allocated and reserved
    memory (bytes divided by 1024**3).

    When no CUDA device is usable, all CUDA calls are skipped, so the
    function is safe to call in CPU-only sessions.

    Returns:
        None
    """
    gc.collect()

    # Guard every CUDA call: torch.cuda.synchronize() raises when no CUDA
    # device is usable (CPU-only runtime, or CUDA_VISIBLE_DEVICES set to '').
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    bytes_per_gb = 1024**3
    mem_allocated = torch.cuda.memory_allocated() / bytes_per_gb
    mem_reserved = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

# Run one cleanup pass now so later cells start from a freshly cleared state.
clear_cuda_memory()
print("‚úì Memory cleared")

In [None]:
# Kaggle locations (adjust for your environment): working directory and dataset root
BASE_PATH = Path('/kaggle/working')
DATA_PATH = Path('/kaggle/input/acdc-dataset-challenge-2024/database')

# The three directories nnU-Net requires, all placed under BASE_PATH
nnUNet_raw = BASE_PATH / 'nnUNet_raw'
nnUNet_preprocessed = BASE_PATH / 'nnUNet_preprocessed'
nnUNet_results = BASE_PATH / 'nnUNet_results'

# Export each directory under the environment variable of the same name
os.environ.update({
    'nnUNet_raw': str(nnUNet_raw),
    'nnUNet_preprocessed': str(nnUNet_preprocessed),
    'nnUNet_results': str(nnUNet_results),
})

# Create the directories, including parents; existing ones are left alone
for path in (nnUNet_raw, nnUNet_preprocessed, nnUNet_results):
    path.mkdir(parents=True, exist_ok=True)

print("‚úì Paths configured")

In [None]:
# FIXED: Memory-efficient nnU-Net wrapper
class MemoryEfficientNNUNetTrainer:
    """Thin wrapper that runs ``nnUNetv2_train`` one fold at a time.

    Each fold is trained in a separate subprocess, with
    ``clear_cuda_memory()`` called in this process before and after the run.
    """

    def __init__(self, dataset_id=500):
        """Store the dataset ID that is passed to every training command.

        Args:
            dataset_id: nnU-Net dataset ID (default 500).
        """
        self.dataset_id = dataset_id

    def train_fold(self, fold, configuration="2d"):
        """Train a single fold by invoking ``nnUNetv2_train`` as a subprocess.

        Args:
            fold: Fold index forwarded to the command line.
            configuration: nnU-Net configuration name (default ``"2d"``).

        Raises:
            subprocess.CalledProcessError: If the training command exits with
                a non-zero status. The error is printed and re-raised.
        """
        banner = '=' * 60
        print(f"\n{banner}")
        print(f"Training Fold {fold} - {configuration}")
        print(f"{banner}")

        # Clear memory before training
        clear_cuda_memory()

        cmd = [
            "nnUNetv2_train",
            str(self.dataset_id),
            configuration,
            str(fold),
            # Saves the final-validation softmax outputs as .npz files (used
            # for ensembling / best-configuration search). This does not
            # reduce memory use; it adds disk output.
            "--npz",
        ]

        # Child environment: inherit everything, but cap the number of
        # data-augmentation worker processes at 4.
        env = {**os.environ, 'nnUNet_n_proc_DA': '4'}

        print(f"Running: {' '.join(cmd)}")

        try:
            subprocess.run(cmd, check=True, env=env)
        except subprocess.CalledProcessError as e:
            print(f"‚ùå Error in fold {fold}: {e}")
            raise
        else:
            print(f"‚úì Fold {fold} completed successfully!")
        finally:
            # Always clear memory after training, whether it succeeded or not
            clear_cuda_memory()

    def train_all_folds(self, n_folds=5, configuration="2d"):
        """Train folds ``0 .. n_folds - 1``, continuing past failed folds.

        A failure in one fold is reported and recorded, and training moves
        on to the next fold. The closing message states whether any fold
        failed instead of always announcing success.

        Args:
            n_folds: Number of folds to train (default 5).
            configuration: nnU-Net configuration name (default ``"2d"``).

        Returns:
            list: Indices of the folds that failed, in training order; empty
            when every fold succeeded.
        """
        print(f"\nStarting {n_folds}-fold training...")

        failed_folds = []
        for fold in range(n_folds):
            try:
                self.train_fold(fold, configuration)
            except Exception as e:
                # Deliberate best-effort: one bad fold must not abort the
                # remaining ones, but it is remembered for the summary.
                failed_folds.append(fold)
                print(f"‚ö†Ô∏è Fold {fold} failed: {e}")
                print("Continuing with next fold...")

        if failed_folds:
            print(f"\nTraining finished with {len(failed_folds)} failed fold(s): {failed_folds}")
        else:
            print("\n‚úì Training complete!")

        return failed_folds

# Create the trainer for dataset ID 500; the training cell calls its
# train_fold method through this module-level name.
nnunet_trainer = MemoryEfficientNNUNetTrainer(dataset_id=500)
print("‚úì Trainer initialized")

In [None]:
# BEFORE TRAINING: rewrite the existing nnU-Net plans so that every
# configuration uses config.BATCH_SIZE. Does nothing if the plans file
# has not been generated yet.
import json

plans_file = nnUNet_preprocessed / 'Dataset500_ACDC' / 'nnUNetPlans.json'

if not plans_file.exists():
    print("‚ö†Ô∏è Plans file not found - will be created during preprocessing")
else:
    plans = json.loads(plans_file.read_text())

    # Overwrite batch_size inside each configuration entry
    for config_name, settings in plans['configurations'].items():
        settings['batch_size'] = config.BATCH_SIZE
        print(f"‚úì Set {config_name} batch_size = {config.BATCH_SIZE}")

    # Write the patched plans back over the original file
    plans_file.write_text(json.dumps(plans, indent=2))

    print("‚úì nnUNet plans modified for memory efficiency")

In [None]:
# Train fold 0 as a smoke test, then folds 1-4, reporting failures with hints.
#
# train_fold runs nnU-Net in a child process, so an out-of-memory failure
# inside training reaches this cell as subprocess.CalledProcessError, not as
# torch.cuda.OutOfMemoryError. Both cases print the same memory-reduction hints.
def _print_oom_suggestions():
    """Print the options to try after a CUDA out-of-memory failure."""
    print("\nSuggestions:")
    print("1. Reduce BATCH_SIZE further (try 2 or 1)")
    print("2. Reduce PATCH_SIZE to [96, 96, 96]")
    print("3. Use CPU training (slower but won't crash)")
    print("4. Request more GPU memory from Kaggle")


try:
    # Start with ONE fold first to test
    print("Starting with Fold 0 as test...")
    nnunet_trainer.train_fold(fold=0, configuration="2d")

    # If successful, continue with remaining folds
    print("\nFold 0 successful! Continuing with remaining folds...")
    for fold in range(1, 5):
        nnunet_trainer.train_fold(fold=fold, configuration="2d")

except torch.cuda.OutOfMemoryError:
    # Only reachable if this (parent) process itself runs out of GPU memory
    print("\n‚ùå CUDA OUT OF MEMORY ERROR")
    _print_oom_suggestions()

except subprocess.CalledProcessError as e:
    # The training subprocess exited with a non-zero status. The cause is in
    # the nnU-Net output above; out-of-memory is the failure this notebook
    # targets, so the hints are shown here as well.
    print(f"\n‚ùå Training error: {e}")
    print("If the training output above reports a CUDA out-of-memory error:")
    _print_oom_suggestions()

except Exception as e:
    print(f"\n‚ùå Training error: {e}")
    import traceback
    traceback.print_exc()

## 🚨 If Still Getting OOM Errors:

### Option 1: Further reduce batch size
```python
BATCH_SIZE = 2  # or even 1
```

### Option 2: Use smaller patch size
```python
PATCH_SIZE = [96, 96, 96]
```

### Option 3: Train on CPU (slower but stable)
```python
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # Disable GPU
```

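### Option 4: Use gradient accumulation
This simulates larger batches without using more memory. Note that `nnUNetv2_train` does not provide a gradient-accumulation command-line flag, so this requires a custom trainer class (selected with `-tr`):
```python
# In a custom nnUNetTrainer subclass, accumulate gradients over
# several steps (e.g. 8) before each optimizer step.
```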