# Preprocessing Pipeline Validation

This notebook validates the preprocessing pipeline by comparing raw and preprocessed volumes.

## Objectives
- Load preprocessed volumes generated by `prepare_data.py`
- Compare raw vs preprocessed volumes
- Visualize the effect of resampling
- Verify normalization (HU clamping and scaling)


In [None]:
import sys
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Optional

sys.path.insert(0, str(Path().absolute().parent))

from src.data import preprocessing


## 1. Load Raw and Preprocessed Volumes

Load both the original raw volume and its preprocessed version for comparison.


In [None]:
# Configuration: paths to the two volumes being compared
# Raw volume (original data, before preprocessing)
raw_volume_path = Path("../data/samples/exp_volume_0.nii.gz")

# Preprocessed volume (generated by prepare_data.py)
preprocessed_volume_path = Path("../data/processed/exp_volume_0_preprocessed.nii.gz")


def _print_volume_summary(label, volume, metadata):
    """Print shape, spacing, dtype and intensity range of a freshly loaded volume.

    Args:
        label: Prefix for the heading line (e.g. "Raw").
        volume: Loaded volume array.
        metadata: Mapping returned by the loader; its 'spacing' entry is printed.
    """
    print(f"{label} volume loaded:")
    print(f"  Shape: {volume.shape}")
    print(f"  Spacing: {metadata['spacing']}")
    print(f"  Data type: {volume.dtype}")
    print(f"  Intensity range: [{volume.min():.2f}, {volume.max():.2f}]")


# Raw volume: a directory path is handed to the DICOM loader, anything else
# to the NIfTI loader.
print("Loading raw volume...")
_load_raw = (
    preprocessing.load_dicom_volume
    if raw_volume_path.is_dir()
    else preprocessing.load_nifti_volume
)
raw_volume, raw_metadata = _load_raw(raw_volume_path)
_print_volume_summary("Raw", raw_volume, raw_metadata)

# Preprocessed volume: optional. When the file is missing, both names are set
# to None so the later cells can detect this and skip their comparisons.
print("\nLoading preprocessed volume...")
if not preprocessed_volume_path.exists():
    print(f"ERROR: Preprocessed volume not found at {preprocessed_volume_path}")
    print("Please run prepare_data.py first to generate preprocessed volumes.")
    preprocessed_volume = None
    preprocessed_metadata = None
else:
    preprocessed_volume, preprocessed_metadata = preprocessing.load_nifti_volume(preprocessed_volume_path)
    _print_volume_summary("Preprocessed", preprocessed_volume, preprocessed_metadata)


## 2. Compare Raw vs Preprocessed

Compare key characteristics between raw and preprocessed volumes.


In [None]:
def compare_volumes(raw_vol: np.ndarray, raw_meta: dict, preprocessed_vol: np.ndarray, preprocessed_meta: dict):
    """Print a text report contrasting a raw volume with its preprocessed version.

    The report lists array shapes (and their element-wise difference), the
    'spacing' metadata entries, min/max/mean intensities and in-memory sizes.
    If ``preprocessed_meta`` has an 'original_shape' entry, a fifth section
    prints the recorded original shape/spacing plus the optional
    'crop_slices' and 'hu_range' entries.

    Args:
        raw_vol: Raw volume array.
        raw_meta: Metadata of the raw volume; must contain 'spacing'.
        preprocessed_vol: Preprocessed volume array.
        preprocessed_meta: Metadata of the preprocessed volume; must contain
            'spacing' (and 'original_spacing' whenever 'original_shape' is set).

    Returns:
        None. Everything is written to stdout.
    """
    banner = "=" * 60
    bytes_per_mb = 1024 * 1024

    print(banner)
    print("COMPARISON: Raw vs Preprocessed")
    print(banner)

    print("\n1. Shape Comparison:")
    print(f"   Raw:         {raw_vol.shape}")
    print(f"   Preprocessed: {preprocessed_vol.shape}")
    shape_delta = np.asarray(preprocessed_vol.shape) - np.asarray(raw_vol.shape)
    print(f"   Change:      {shape_delta}")

    print("\n2. Spacing Comparison:")
    print(f"   Raw:         {raw_meta['spacing']}")
    print(f"   Preprocessed: {preprocessed_meta['spacing']}")
    print("   Target:      (1.0, 1.0, 1.0) [isotropic]")

    print("\n3. Intensity Statistics:")
    for label, vol in (("Raw", raw_vol), ("Preprocessed", preprocessed_vol)):
        print(f"   {label} - Min: {vol.min():.2f}, Max: {vol.max():.2f}, Mean: {vol.mean():.2f}")

    print("\n4. Memory Usage:")
    raw_mb = raw_vol.nbytes / bytes_per_mb
    prep_mb = preprocessed_vol.nbytes / bytes_per_mb
    print(f"   Raw:         {raw_mb:.2f} MB")
    print(f"   Preprocessed: {prep_mb:.2f} MB")
    delta_mb = prep_mb - raw_mb
    delta_pct = (prep_mb / raw_mb - 1) * 100
    print(f"   Change:      {delta_mb:.2f} MB ({delta_pct:.1f}%)")

    # The provenance section is only shown when the metadata records it.
    if "original_shape" in preprocessed_meta:
        print("\n5. Preprocessing Steps Applied:")
        print(f"   Original shape: {preprocessed_meta['original_shape']}")
        print(f"   Original spacing: {preprocessed_meta['original_spacing']}")
        optional_entries = (("crop_slices", "Cropping applied"), ("hu_range", "HU range"))
        for key, caption in optional_entries:
            if key in preprocessed_meta:
                print(f"   {caption}: {preprocessed_meta[key]}")

    print(banner)

# The comparison needs both volumes; the load cell leaves
# preprocessed_volume as None when the preprocessed file was not found.
if preprocessed_volume is None:
    print("Cannot compare: preprocessed volume not loaded.")
else:
    compare_volumes(raw_volume, raw_metadata, preprocessed_volume, preprocessed_metadata)


## 3. Interactive 3D Comparison with Napari

Use Napari to interactively compare the raw and preprocessed volumes as two overlaid layers in a single viewer (toggle layers or adjust their opacity to switch between them).


In [None]:
# Interactive 3D comparison with Napari
# This allows you to scroll through both volumes simultaneously
# and see the effects of preprocessing in real-time

# napari is not imported in the notebook's import cell, so without this import
# the cell fails with NameError on a fresh kernel. It is an optional GUI
# dependency, so a missing installation skips the viewer instead of aborting
# the rest of the notebook.
try:
    import napari
except ImportError:
    napari = None

if preprocessed_volume is None:
    print("Preprocessed volume not available. Skipping Napari visualization.")
elif napari is None:
    print("napari is not installed. Skipping Napari visualization.")
else:
    print("Opening Napari viewer for raw vs preprocessed comparison...")
    print("Instructions:")
    print("  - Use mouse wheel to scroll through slices")
    print("  - Adjust layer opacity to blend between raw and preprocessed")
    print("  - Toggle layers on/off to compare")
    print("  - Use contrast limits to adjust brightness/contrast")

    viewer = napari.Viewer(title="Raw vs Preprocessed Volume Comparison")

    # Per-layer scale gives each volume its physical aspect ratio.
    # NOTE(review): assumes metadata["spacing"] is ordered like the array
    # axes -- confirm against the loaders in src.data.preprocessing.
    raw_spacing = raw_metadata.get("spacing", [1.0, 1.0, 1.0])
    prep_spacing = preprocessed_metadata.get("spacing", [1.0, 1.0, 1.0])

    # Add raw volume
    viewer.add_image(
        raw_volume,
        name="Raw Volume",
        colormap="gray",
        contrast_limits=[raw_volume.min(), raw_volume.max()],
        opacity=0.7,
        scale=raw_spacing,
    )

    # Add preprocessed volume (drawn on top of the raw layer)
    viewer.add_image(
        preprocessed_volume,
        name="Preprocessed Volume",
        colormap="gray",
        contrast_limits=[preprocessed_volume.min(), preprocessed_volume.max()],
        opacity=0.7,
        scale=prep_spacing,
    )

    print("\nNapari viewer opened!")
    print("You can now compare raw and preprocessed volumes interactively.")
    print("Close the viewer window when done.")

    # Start the GUI event loop for the viewer.
    napari.run()


In [None]:
def visualize_resampling_effect(
    raw_vol: np.ndarray,
    preprocessed_vol: np.ndarray,
    raw_meta: Optional[dict] = None,
    preprocessed_meta: Optional[dict] = None,
):
    """Plot the middle slices of both volumes and report the voxel-volume change.

    Draws a 2x3 grid (row 0: raw, row 1: preprocessed) with the middle slice
    along axis 0, 1 and 2, titled "Axial", "Coronal" and "Sagittal"
    respectively, then prints the product of each volume's 'spacing' entry
    and the preprocessed/raw ratio.

    Args:
        raw_vol: Raw volume; must be 3-D.
        preprocessed_vol: Preprocessed volume; must be 3-D.
        raw_meta: Metadata of the raw volume (its 'spacing' entry is used).
            When omitted, the notebook-level ``raw_metadata`` is used so
            existing two-argument calls keep working.
        preprocessed_meta: Metadata of the preprocessed volume. When omitted,
            the notebook-level ``preprocessed_metadata`` is used.

    Raises:
        ValueError: If either volume is not 3-D.
    """
    rows = (("Raw", raw_vol), ("Preprocessed", preprocessed_vol))
    for row_label, vol in rows:
        if vol.ndim != 3:
            raise ValueError(
                f"{row_label} volume must be 3-D, got shape {vol.shape}"
            )

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle("Resampling Effect: Raw vs Preprocessed", fontsize=16)

    plane_names = ("Axial", "Coronal", "Sagittal")
    for row, (row_label, vol) in enumerate(rows):
        for axis, plane in enumerate(plane_names):
            size = vol.shape[axis]
            mid = size // 2
            ax = axes[row, axis]
            # np.take(vol, mid, axis) selects the same 2-D slice as
            # vol[mid, :, :], vol[:, mid, :] or vol[:, :, mid].
            ax.imshow(np.take(vol, mid, axis=axis), cmap="gray")
            title = f"{row_label} - {plane} (Slice {mid}/{size - 1})"
            if axis == 0:
                # Only the first panel of each row carries the full shape.
                title += f"\nShape: {vol.shape}"
            ax.set_title(title)
            ax.axis("off")

    plt.tight_layout()
    plt.show()

    # Prefer explicitly passed metadata; fall back to the notebook globals
    # only for callers that still use the two-argument form.
    if raw_meta is None:
        raw_meta = raw_metadata
    if preprocessed_meta is None:
        preprocessed_meta = preprocessed_metadata

    # Calculate voxel size change
    raw_voxel_volume = np.prod(raw_meta['spacing'])
    prep_voxel_volume = np.prod(preprocessed_meta['spacing'])
    print(f"\nVoxel Volume:")
    print(f"  Raw:         {raw_voxel_volume:.4f} mm³")
    print(f"  Preprocessed: {prep_voxel_volume:.4f} mm³")
    print(f"  Ratio:       {prep_voxel_volume / raw_voxel_volume:.4f}")

if preprocessed_volume is not None:
    # Pass the metadata explicitly so the function does not depend on
    # whatever happens to be in the kernel namespace.
    visualize_resampling_effect(
        raw_volume,
        preprocessed_volume,
        raw_meta=raw_metadata,
        preprocessed_meta=preprocessed_metadata,
    )
else:
    print("Cannot visualize: preprocessed volume not loaded.")


## 4. Verify Normalization

Check that HU normalization (clamping and scaling) was applied correctly.


In [None]:
def verify_normalization(raw_vol: np.ndarray, preprocessed_vol: np.ndarray, hu_min: float = -1000.0, hu_max: float = 1000.0):
    """Report whether the preprocessed volume looks HU-normalized and plot histograms.

    Prints min/max/mean/median of both volumes, flags whether every
    preprocessed value lies in [0, 1], counts the raw voxels outside
    [hu_min, hu_max], and shows side-by-side intensity histograms with the
    respective bounds marked.

    Args:
        raw_vol: Raw volume array.
        preprocessed_vol: Preprocessed volume array.
        hu_min: Lower bound used for the clamping check and the raw histogram marker.
        hu_max: Upper bound used for the clamping check and the raw histogram marker.

    Returns:
        None. Results are printed and plotted.
    """
    rule = "=" * 60
    print(rule)
    print("NORMALIZATION VERIFICATION")
    print(rule)

    # What the pipeline is expected to have done.
    print("\nExpected normalization:")
    print(f"  1. Clamp HU values to [{hu_min}, {hu_max}]")
    print("  2. Scale to [0, 1] range")

    # Same four summary statistics for both volumes.
    for heading, vol in (("Raw", raw_vol), ("Preprocessed", preprocessed_vol)):
        print(f"\n{heading} volume intensity range:")
        print(f"  Min: {vol.min():.2f}")
        print(f"  Max: {vol.max():.2f}")
        print(f"  Mean: {vol.mean():.2f}")
        print(f"  Median: {np.median(vol):.2f}")

    # Range check only: confirms the output lies in [0, 1], nothing more.
    lowest = preprocessed_vol.min()
    highest = preprocessed_vol.max()
    in_unit_range = bool(0.0 <= lowest and highest <= 1.0)

    print("\nVerification:")
    if not in_unit_range:
        print("  [WARNING] Values are NOT in [0, 1] range")
        print("           This may indicate an issue with normalization.")
    else:
        print("  [OK] Values are in [0, 1] range")

    # How many raw voxels fall outside the clamp window.
    raw_below_min = (raw_vol < hu_min).sum()
    raw_above_max = (raw_vol > hu_max).sum()
    total_voxels = raw_vol.size

    print("\nClamping check:")
    print(f"  Raw values below {hu_min}: {raw_below_min} voxels ({raw_below_min / total_voxels * 100:.2f}%)")
    print(f"  Raw values above {hu_max}: {raw_above_max} voxels ({raw_above_max / total_voxels * 100:.2f}%)")

    # Histogram comparison: one panel per volume, bounds drawn as dashed lines.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    panels = (
        (
            raw_vol,
            "blue",
            ((hu_min, f"HU min ({hu_min})"), (hu_max, f"HU max ({hu_max})")),
            "Intensity Value (HU)",
            "Raw Volume - Intensity Distribution",
        ),
        (
            preprocessed_vol,
            "green",
            ((0.0, "Min (0.0)"), (1.0, "Max (1.0)")),
            "Intensity Value (Normalized)",
            "Preprocessed Volume - Intensity Distribution",
        ),
    )
    for ax, (vol, color, markers, xlabel, title) in zip(axes, panels):
        ax.hist(vol.flatten(), bins=100, edgecolor="black", alpha=0.7, color=color)
        for position, label in markers:
            ax.axvline(position, color="red", linestyle="--", label=label)
        ax.set_xlabel(xlabel)
        ax.set_ylabel("Frequency")
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(rule)

# Runs with the function's default HU window; skipped when the load cell
# left preprocessed_volume as None.
if preprocessed_volume is None:
    print("Cannot verify normalization: preprocessed volume not loaded.")
else:
    verify_normalization(raw_volume, preprocessed_volume)


In [None]:
def side_by_side_comparison(raw_vol: np.ndarray, preprocessed_vol: np.ndarray):
    """Show the middle slices of both volumes next to each other, one row per axis.

    Produces a 3x2 figure: rows are the middle slices along axis 0, 1 and 2
    (titled "Axial", "Coronal", "Sagittal"), the left column is the raw
    volume and the right column the preprocessed one. Each panel gets its
    own colorbar.

    Args:
        raw_vol: Raw volume; must be 3-D (its shape is unpacked into three sizes).
        preprocessed_vol: Preprocessed volume; must be 3-D.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, axes = plt.subplots(3, 2, figsize=(12, 15))
    fig.suptitle("Side-by-Side Comparison: Raw vs Preprocessed", fontsize=16)

    r0, r1, r2 = raw_vol.shape
    p0, p1, p2 = preprocessed_vol.shape

    # Min-max rescale the raw data for display only; the small epsilon keeps
    # the division defined for a constant volume.
    raw_floor = raw_vol.min()
    raw_display = (raw_vol - raw_floor) / (raw_vol.max() - raw_floor + 1e-8)

    planes = (
        ("Axial", raw_display[r0 // 2], preprocessed_vol[p0 // 2]),
        ("Coronal", raw_display[:, r1 // 2], preprocessed_vol[:, p1 // 2]),
        ("Sagittal", raw_display[..., r2 // 2], preprocessed_vol[..., p2 // 2]),
    )

    for row, (plane_name, raw_slice, prep_slice) in enumerate(planes):
        columns = ((raw_slice, "Raw"), (prep_slice, "Preprocessed"))
        for col, (image, source) in enumerate(columns):
            ax = axes[row, col]
            handle = ax.imshow(image, cmap="gray")
            ax.set_title(f"{plane_name} - {source} Volume")
            ax.axis("off")
            plt.colorbar(handle, ax=ax, fraction=0.046)

    plt.tight_layout()
    plt.show()

# Needs both volumes; skipped when the load cell left preprocessed_volume as None.
if preprocessed_volume is None:
    print("Cannot create comparison: preprocessed volume not loaded.")
else:
    side_by_side_comparison(raw_volume, preprocessed_volume)


## 6. Test Preprocessing Pipeline

Manually test the preprocessing pipeline to verify each step.


In [None]:
# Test preprocessing steps manually: re-run resample -> crop -> normalize on
# the raw volume and compare the result with the saved preprocessed volume.
print("Testing preprocessing pipeline step by step...\n")

# Step 1: Resampling
print("Step 1: Resampling to isotropic spacing (1x1x1 mm)")
resampled, new_spacing = preprocessing.resample_isotropic(
    raw_volume,
    raw_metadata["spacing"],
    target_spacing=(1.0, 1.0, 1.0)
)
print(f"  Original shape: {raw_volume.shape} -> Resampled shape: {resampled.shape}")
print(f"  Original spacing: {raw_metadata['spacing']} -> New spacing: {new_spacing}")
print(f"  [OK] Resampling completed\n")

# Step 2: Auto-cropping
print("Step 2: Auto-cropping empty regions")
cropped, crop_slices = preprocessing.auto_crop(resampled, threshold=0.01)
print(f"  Resampled shape: {resampled.shape} -> Cropped shape: {cropped.shape}")
print(f"  Crop slices: {crop_slices}")
print(f"  [OK] Cropping completed\n")

# Step 3: Normalization
print("Step 3: HU normalization (clamp + scale)")
normalized = preprocessing.normalize_hu(cropped, hu_min=-1000.0, hu_max=1000.0)
print(f"  Before normalization - Min: {cropped.min():.2f}, Max: {cropped.max():.2f}")
print(f"  After normalization - Min: {normalized.min():.2f}, Max: {normalized.max():.2f}")
print(f"  [OK] Normalization completed\n")

# Compare with preprocessed volume
if preprocessed_volume is not None:
    print("Comparison with saved preprocessed volume:")
    shape_match = normalized.shape == preprocessed_volume.shape

    # An element-wise comparison only makes sense for identical shapes. With
    # different shapes np.allclose either raises ValueError (shapes cannot be
    # broadcast) before the report below is printed, or silently broadcasts
    # and returns a meaningless answer -- so it is skipped in that case.
    intensity_match = shape_match and bool(
        np.allclose(normalized, preprocessed_volume, atol=1e-5)
    )

    print(f"  Shape match: {shape_match}")
    if shape_match:
        print(f"  Intensity match: {intensity_match}")
    else:
        print("  Intensity match: skipped (shapes differ)")

    if shape_match and intensity_match:
        print(f"  [OK] Manual preprocessing matches saved preprocessed volume!")
    else:
        print(f"  [WARNING] Differences detected:")
        if not shape_match:
            print(f"    - Shape: {normalized.shape} vs {preprocessed_volume.shape}")
        else:
            # Shapes agree, so the subtraction is well defined.
            diff = np.abs(normalized - preprocessed_volume).max()
            print(f"    - Max intensity difference: {diff:.6f}")
else:
    print("  Cannot compare: preprocessed volume not loaded.")


## Summary

This notebook validated the preprocessing pipeline by:

1. **Loading volumes**: Successfully loaded raw and preprocessed volumes
2. **Comparison**: Compared shape, spacing, intensity ranges, and memory usage
3. **Resampling visualization**: Visualized how resampling changes volume dimensions
4. **Normalization verification**: Confirmed that HU values are correctly clamped and scaled to [0, 1]
5. **Side-by-side comparison**: Direct visual comparison of raw vs preprocessed
6. **Pipeline testing**: Manually tested each preprocessing step

### Key Findings:
- Resampling converts volumes to isotropic spacing (1x1x1 mm)
- Auto-cropping removes empty regions to reduce memory usage
- Normalization clamps HU values to [-1000, 1000] and scales to [0, 1]
- Preprocessing maintains anatomical structure while optimizing for training
