# 4D-Lung Dataset Explorer

Interactive exploration tool for the 4D-Lung collection. It is used to:
- Understand temporal structure and breathing phases
- Validate smooth temporal transitions
- Visualize breathing cycles as videos
- Assess data quality for flow matching training

In [1]:
import os
import numpy as np
import pandas as pd
import ipywidgets as w
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.patches import Rectangle
import pydicom
from pathlib import Path
from tqdm import tqdm
import json
from datetime import datetime

plt.ioff()  # avoid extra figures popping up

<contextlib.ExitStack at 0x7f3f2e347890>

In [2]:
# Root of the raw 4D-Lung download; it is scanned as patient/study/series directories.
RAW_DATA_DIR = Path("/mnt/tcia_data/raw/4D-Lung")
# Destination for every file this notebook writes (CSV, JSON, GIF).
RESULTS_DIR = Path("/mnt/tcia_data/processed/4D-Lung")
# Create the output directory and any missing parents; no error if it already exists.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# Data Discovery Functions
# (output column, DICOM keyword) pairs copied from the sample file's header.
_DICOM_HEADER_FIELDS = (
    ('modality', 'Modality'),
    ('series_time', 'SeriesTime'),
    ('acquisition_time', 'AcquisitionTime'),
    ('slice_thickness', 'SliceThickness'),
    ('pixel_spacing', 'PixelSpacing'),
    ('rows', 'Rows'),
    ('columns', 'Columns'),
    ('respiratory_motion_phase', 'RespiratoryMotionPhase'),
    ('temporal_position', 'TemporalPosition'),
    ('trigger_time', 'TriggerTime'),
)


def _sorted_subdirs(parent):
    """Return the sub-directories of ``parent`` in sorted (deterministic) order."""
    return [entry for entry in sorted(parent.iterdir()) if entry.is_dir()]


def discover_4d_structure(data_dir):
    """Discover the structure of 4D-Lung dataset

    Walks ``<data_dir>/<patient>/<study>/<series>/`` and emits one record per
    series directory that contains at least one ``*.dcm`` file. Header fields
    are taken from the first file in sorted name order.

    Args:
        data_dir: ``pathlib.Path`` pointing at the dataset root.

    Returns:
        ``pandas.DataFrame`` with one row per series. The frame is empty when
        ``data_dir`` is not a directory or holds no series. A series whose
        sample file cannot be read gets ``modality == 'ERROR'``, the exception
        text in ``error`` and ``'Unknown'`` in every other header column, so
        that ``!= 'Unknown'`` checks never mistake it for a series with
        header information.
    """
    if not data_dir.is_dir():
        print(f"Data directory not found: {data_dir}")
        return pd.DataFrame()

    structure = []

    for patient_dir in _sorted_subdirs(data_dir):
        for study_dir in _sorted_subdirs(patient_dir):
            for series_dir in _sorted_subdirs(study_dir):
                # Sorted so the sampled file does not depend on filesystem order.
                dicom_files = sorted(series_dir.glob("*.dcm"))
                if not dicom_files:
                    continue

                record = {
                    'patient_id': patient_dir.name,
                    'study_desc': study_dir.name,
                    'series_desc': series_dir.name,
                    'series_path': str(series_dir),
                    'n_slices': len(dicom_files),
                }

                try:
                    # Only header tags are needed, so skip the pixel data.
                    sample_dcm = pydicom.dcmread(dicom_files[0], stop_before_pixels=True)
                    header = {
                        column: getattr(sample_dcm, keyword, 'Unknown')
                        for column, keyword in _DICOM_HEADER_FIELDS
                    }
                except Exception as e:
                    # Best effort: one unreadable series must not abort the scan.
                    print(f"Error reading {series_dir}: {e}")
                    header = {column: 'Unknown' for column, _ in _DICOM_HEADER_FIELDS}
                    header['modality'] = 'ERROR'
                    header['error'] = str(e)

                record.update(header)
                structure.append(record)

    return pd.DataFrame(structure)

In [5]:
# Scan the raw download tree; the resulting per-series table is kept in the
# module-level name `dataset_df`, which the later cells read.
print("Discovering 4D-Lung dataset structure...")
dataset_df = discover_4d_structure(RAW_DATA_DIR)

if len(dataset_df) > 0:
    print(f"Found {len(dataset_df)} series across {dataset_df['patient_id'].nunique()} patients")

    # Persist the table as CSV (without the index column) under RESULTS_DIR.
    structure_path = RESULTS_DIR / "dataset_structure.csv"
    dataset_df.to_csv(structure_path, index=False)
    print(f"Dataset structure saved to: {structure_path}")
else:
    print("No data found! Please run data_download.ipynb first.")

Discovering 4D-Lung dataset structure...
Found 551 series across 3 patients
Dataset structure saved to: /mnt/tcia_data/processed/4D-Lung/dataset_structure.csv


In [6]:
# Summarise the discovered series: per-patient aggregates, then a count of
# how many series carry respiratory-phase / temporal-position header values.
if len(dataset_df) > 0:
    n_series = len(dataset_df)
    print("\n=== Dataset Structure Analysis ===")

    # One row per patient: series count, distinct modalities, slice-count
    # statistics and the distinct phase / temporal values seen.
    patient_summary = (
        dataset_df.groupby('patient_id')
        .agg({
            'series_desc': 'count',
            'modality': lambda x: list(x.unique()),
            'n_slices': ['min', 'max', 'mean'],
            'respiratory_motion_phase': lambda x: list(x.unique()),
            'temporal_position': lambda x: list(x.unique())
        })
        .round(2)
    )
    series_per_patient = dataset_df.groupby('patient_id').size()

    print(f"Patients: {len(patient_summary)}")
    print(f"Series per patient: {series_per_patient.describe()}")
    print(f"Modalities: {dataset_df['modality'].unique()}")
    print(f"Respiratory phases available: {dataset_df['respiratory_motion_phase'].unique()}")

    # A series counts as having 4D information when the header value is
    # anything other than the 'Unknown' placeholder.
    has_respiratory_info = dataset_df['respiratory_motion_phase'] != 'Unknown'
    has_temporal_info = dataset_df['temporal_position'] != 'Unknown'

    print(f"\nTemporal Information:")
    print(f"- Series with respiratory phase info: {has_respiratory_info.sum()}/{n_series}")
    print(f"- Series with temporal position info: {has_temporal_info.sum()}/{n_series}")

    display(patient_summary.head())



=== Dataset Structure Analysis ===
Patients: 3
Series per patient: count      3.000000
mean     183.666667
std      145.362765
min       81.000000
25%      100.500000
50%      120.000000
75%      235.000000
max      350.000000
dtype: float64
Modalities: ['CT' 'RTSTRUCT']
Respiratory phases available: ['Unknown']

Temporal Information:
- Series with respiratory phase info: 0/551
- Series with temporal position info: 0/551


Unnamed: 0_level_0,series_desc,modality,n_slices,n_slices,n_slices,respiratory_motion_phase,temporal_position
Unnamed: 0_level_1,count,<lambda>,min,max,mean,<lambda>,<lambda>
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
100_HM10395,350,"[CT, RTSTRUCT]",1,142,51.23,[Unknown],[Unknown]
101_HM10395,120,"[CT, RTSTRUCT]",1,149,54.17,[Unknown],[Unknown]
102_HM10395,81,[CT],50,50,50.0,[Unknown],[Unknown]


In [7]:
# Temporal Sequence Analysis Functions
def load_temporal_sequence(patient_id, study_desc, target_modality='CT', dataset=None):
    """Load temporal sequence for a patient

    Selects the series of one patient/study with the requested modality and
    orders them by whichever temporal header columns are fully populated.

    Args:
        patient_id: Value matched against the ``patient_id`` column.
        study_desc: Value matched against the ``study_desc`` column.
        target_modality: Value matched against the ``modality`` column.
        dataset: Optional series table to query; defaults to the module-level
            ``dataset_df``.

    Returns:
        ``(DataFrame, "OK")`` with the matching rows, or ``(None, message)``
        when no series matches.
    """
    if dataset is None:
        dataset = dataset_df

    patient_series = dataset[
        (dataset['patient_id'] == patient_id) &
        (dataset['study_desc'] == study_desc) &
        (dataset['modality'] == target_modality)
    ].copy()

    if len(patient_series) == 0:
        return None, f"No {target_modality} series found"

    # Sort by temporal indicators, most specific first. A column is used only
    # when every row has a real value: mixing the 'Unknown' placeholder with
    # numeric values would make sort_values raise a TypeError.
    sort_cols = []
    for column in ('temporal_position', 'respiratory_motion_phase', 'series_time'):
        if column not in patient_series.columns:
            continue
        values = patient_series[column]
        if values.notna().all() and values.ne('Unknown').all():
            sort_cols.append(column)

    if sort_cols:
        patient_series = patient_series.sort_values(sort_cols)

    return patient_series, "OK"

def load_slice_from_series(series_path, slice_index=None):
    """Read the pixel data of one file from a series directory.

    The ``*.dcm`` files are ordered by sorted path. ``slice_index`` defaults
    to the middle file and out-of-range values are clamped to the first or
    last file.

    Returns:
        ``(pixel_array, "OK")`` on success, otherwise ``(None, message)``;
        any exception is reported through the message instead of being raised.
    """
    try:
        candidates = sorted(Path(series_path).glob("*.dcm"))
        n_files = len(candidates)

        if n_files == 0:
            return None, "No DICOM files found"

        wanted = n_files // 2 if slice_index is None else slice_index

        # Clamp into [0, n_files - 1].
        if wanted > n_files - 1:
            wanted = n_files - 1
        if wanted < 0:
            wanted = 0

        return pydicom.dcmread(candidates[wanted]).pixel_array, "OK"

    except Exception as exc:
        return None, f"Error loading slice: {exc}"

def _rescale_to_unit(img):
    """Min-max scale ``img`` to [0, 1] as float64; a constant image maps to zeros."""
    img = np.asarray(img, dtype=np.float64)
    low = img.min()
    span = img.max() - low
    if span == 0:
        # Avoid 0/0: a flat image carries no contrast to normalise.
        return np.zeros_like(img)
    return (img - low) / span


def _pair_metrics(curr_img, next_img):
    """Return (mse, correlation, intensity_diff) for two same-shaped images."""
    curr_norm = _rescale_to_unit(curr_img)
    next_norm = _rescale_to_unit(next_img)

    mse = float(np.mean((curr_norm - next_norm) ** 2))

    if curr_norm.std() == 0 or next_norm.std() == 0:
        # Pearson correlation is undefined when either image is constant.
        correlation = float('nan')
    else:
        correlation = float(np.corrcoef(curr_norm.ravel(), next_norm.ravel())[0, 1])

    curr_mean = float(np.mean(curr_img))
    next_mean = float(np.mean(next_img))
    # Scale by the larger magnitude so negative or zero means neither flip
    # the sign of the result nor divide by zero.
    scale = max(abs(curr_mean), abs(next_mean))
    intensity_diff = abs(curr_mean - next_mean) / scale if scale else 0.0

    return mse, correlation, intensity_diff


def analyze_temporal_continuity(patient_series):
    """Analyze temporal continuity of breathing sequence

    Compares the default (middle) slice of each series with that of the next
    series in ``patient_series`` order.

    Args:
        patient_series: DataFrame with a ``series_path`` column, one row per
            phase, already in temporal order.

    Returns:
        ``{"error": message}`` when fewer than two rows are given; otherwise
        a list with one dict per comparable consecutive pair holding
        ``phase_from``, ``phase_to``, ``mse``, ``correlation`` and
        ``intensity_diff``. Pairs in which a slice failed to load or the two
        slices differ in shape are skipped, so the list may be empty.
    """
    if len(patient_series) < 2:
        return {"error": "Need at least 2 phases for continuity analysis"}

    continuity_metrics = []
    prev_series = None
    prev_img = None

    for i in range(len(patient_series)):
        curr_series = patient_series.iloc[i]
        # Each series is read once and carried forward to the next pair.
        curr_img, _status = load_slice_from_series(curr_series['series_path'])

        comparable = (
            prev_img is not None
            and curr_img is not None
            and prev_img.shape == curr_img.shape
        )
        if comparable:
            mse, correlation, intensity_diff = _pair_metrics(prev_img, curr_img)
            continuity_metrics.append({
                'phase_from': prev_series.get('respiratory_motion_phase', i - 1),
                'phase_to': curr_series.get('respiratory_motion_phase', i),
                'mse': mse,
                'correlation': correlation,
                'intensity_diff': intensity_diff
            })

        prev_series, prev_img = curr_series, curr_img

    return continuity_metrics

In [None]:
# Interactive Widgets
patients = sorted(dataset_df['patient_id'].unique()) if len(dataset_df) > 0 else []

# Distinct studies per patient, in first-seen order. dataset_df has one row
# per series, so the study names must be de-duplicated for the dropdown.
studies_by_patient = (
    {
        patient: studies.unique().tolist()
        for patient, studies in dataset_df.groupby('patient_id')['study_desc']
    }
    if len(dataset_df) > 0 else {}
)

# Highest valid slice index across all discovered series, so every slice of
# the largest series can be selected; 100 is the fallback when nothing is loaded.
max_slice_index = max(int(dataset_df['n_slices'].max()) - 1, 0) if len(dataset_df) > 0 else 100

# Patient selection
patient_dd = w.Dropdown(
    options=patients,
    description="Patient:",
    layout=w.Layout(width="200px"),
)

# Study selection (will be updated based on patient)
study_dd = w.Dropdown(
    options=[],
    description="Study:",
    layout=w.Layout(width="300px"),
)

# Slice selection
slice_slider = w.IntSlider(
    value=0,
    min=0,
    max=max_slice_index,
    step=1,
    description="Slice:",
    layout=w.Layout(width="300px"),
)

# Analysis buttons
analyze_btn = w.Button(description="Analyze Temporal Sequence", button_style="primary")
video_btn = w.Button(description="Create Breathing Video", button_style="success")
continuity_btn = w.Button(description="Check Continuity", button_style="warning")

# Output areas: one per tab, each callback writes only into its own.
sequence_out = w.Output()
video_out = w.Output()
continuity_out = w.Output()

In [9]:
# Widget Callbacks
def update_studies(change):
    """Refresh the study dropdown for the patient given in ``change['new']``.

    An unknown patient empties the dropdown; otherwise its options become
    that patient's studies and the first one (if any) is selected.
    """
    selected = change['new']

    if selected not in studies_by_patient:
        study_dd.options = []
        return

    available = studies_by_patient[selected]
    study_dd.options = available
    study_dd.value = available[0] if available else None

def analyze_sequence(_):
    """Button callback: list the selected study's phases and show thumbnails.

    Output goes to ``sequence_out``: a table of the series in the sequence,
    followed by a grid with the default slice of up to the first 10 phases.
    """
    sequence_out.clear_output()

    with sequence_out:
        chosen_patient = patient_dd.value
        chosen_study = study_dd.value

        if not (chosen_patient and chosen_study):
            print("Please select patient and study")
            return

        patient_series, status = load_temporal_sequence(chosen_patient, chosen_study)

        if patient_series is None:
            print(f"Error: {status}")
            return

        n_phases = len(patient_series)

        print(f"=== Temporal Sequence Analysis ===")
        print(f"Patient: {chosen_patient}")
        print(f"Study: {chosen_study}")
        print(f"Number of phases: {n_phases}")

        # Tabular overview with a running phase index appended.
        overview_columns = ['series_desc', 'n_slices', 'respiratory_motion_phase', 'temporal_position']
        sequence_info = patient_series[overview_columns].copy()
        sequence_info['phase_index'] = range(len(sequence_info))

        display(sequence_info)

        # Thumbnail grid: two rows, at most five columns.
        fig, axes = plt.subplots(2, min(5, n_phases), figsize=(15, 6))
        if n_phases == 1:
            axes = np.array([axes])
        axes = axes.flatten()

        # Only the first 10 phases are drawn.
        for idx, (_label, row) in enumerate(patient_series.head(10).iterrows()):
            img, status = load_slice_from_series(row['series_path'])
            panel = axes[idx]

            if img is None:
                panel.text(0.5, 0.5, f"Error\n{status}", ha='center', va='center')
            else:
                panel.imshow(img, cmap='gray')
                panel.set_title(f"Phase {idx}\n{row.get('respiratory_motion_phase', 'Unknown')}")
            panel.axis('off')

        # Blank out grid cells beyond the number of phases.
        for spare in axes[n_phases:]:
            spare.axis('off')

        plt.tight_layout()
        plt.show()

def create_breathing_video(_):
    """Create breathing cycle video

    Button callback. Loads the slice selected by ``slice_slider`` from every
    phase of the selected patient/study, animates the phases in sequence
    order (two passes through the cycle, 500 ms per frame) and saves the
    animation as ``<patient>_breathing_cycle.gif`` in ``RESULTS_DIR``.
    All output goes to ``video_out``.
    """
    video_out.clear_output()

    with video_out:
        if not patient_dd.value or not study_dd.value:
            print("Please select patient and study")
            return

        patient_series, status = load_temporal_sequence(patient_dd.value, study_dd.value)

        if patient_series is None:
            print(f"Error: {status}")
            return

        print("Creating breathing cycle video...")

        # Load all phases, reporting the ones that cannot be read instead of
        # dropping them silently (frame numbers then skip those phases).
        phases = []
        for _index, series in patient_series.iterrows():
            img, status = load_slice_from_series(series['series_path'], slice_slider.value)
            if img is None:
                print(f"Skipping {series['series_desc']}: {status}")
                continue
            phases.append(img)

        if len(phases) < 2:
            print("Need at least 2 phases for video")
            return

        # Create animation
        fig, ax = plt.subplots(figsize=(8, 8))
        try:
            im = ax.imshow(phases[0], cmap='gray')
            ax.axis('off')
            title = ax.set_title(f"Breathing Cycle - Phase 0")

            def animate(frame):
                phase_index = frame % len(phases)
                im.set_array(phases[phase_index])
                title.set_text(f"Breathing Cycle - Phase {phase_index}")
                return [im, title]

            anim = animation.FuncAnimation(fig, animate, frames=len(phases)*2,
                                         interval=500, blit=True, repeat=True)

            plt.show()

            # The GIF is always written; fps=2 matches the 500 ms frame interval.
            video_path = RESULTS_DIR / f"{patient_dd.value}_breathing_cycle.gif"
            anim.save(video_path, writer='pillow', fps=2)
            print(f"Video saved to: {video_path}")
        finally:
            # Release the figure so repeated clicks do not accumulate open figures.
            plt.close(fig)

def check_continuity(_):
    """Check temporal continuity

    Button callback. Runs ``analyze_temporal_continuity`` on the selected
    patient/study and writes the mean metrics, the per-transition table and
    three line plots (MSE, correlation, relative intensity difference) to
    ``continuity_out``.
    """
    continuity_out.clear_output()

    with continuity_out:
        if not patient_dd.value or not study_dd.value:
            print("Please select patient and study")
            return

        patient_series, status = load_temporal_sequence(patient_dd.value, study_dd.value)

        if patient_series is None:
            print(f"Error: {status}")
            return

        print("Analyzing temporal continuity...")

        continuity_metrics = analyze_temporal_continuity(patient_series)

        # The analysis returns a dict only to report an error; a normal
        # result is a list of per-transition dicts.
        if isinstance(continuity_metrics, dict) and 'error' in continuity_metrics:
            print(f"Error: {continuity_metrics['error']}")
            return

        # An empty list means no consecutive pair could be compared; without
        # this guard the column lookups below raise KeyError.
        if not continuity_metrics:
            print("Error: No consecutive phases could be loaded for comparison")
            return

        # Display metrics
        metrics_df = pd.DataFrame(continuity_metrics)

        print(f"Temporal Continuity Analysis:")
        print(f"Mean MSE: {metrics_df['mse'].mean():.4f}")
        print(f"Mean Correlation: {metrics_df['correlation'].mean():.4f}")
        print(f"Mean Intensity Difference: {metrics_df['intensity_diff'].mean():.4f}")

        display(metrics_df)

        # Plot continuity metrics: (column, title, y-label) per panel.
        panels = (
            ('mse', 'MSE Between Consecutive Phases', 'MSE'),
            ('correlation', 'Correlation Between Consecutive Phases', 'Correlation'),
            ('intensity_diff', 'Intensity Difference Between Consecutive Phases',
             'Relative Intensity Difference'),
        )

        fig, axes = plt.subplots(1, 3, figsize=(15, 4))
        try:
            for ax, (column, panel_title, ylabel) in zip(axes, panels):
                ax.plot(metrics_df[column], 'o-')
                ax.set_title(panel_title)
                ax.set_xlabel('Phase Transition')
                ax.set_ylabel(ylabel)

            plt.tight_layout()
            plt.show()
        finally:
            # Release the figure so repeated clicks do not accumulate open figures.
            plt.close(fig)

# Connect callbacks: the patient dropdown drives the study list, and each
# button triggers its own analysis function.
patient_dd.observe(update_studies, names='value')
for _button, _handler in (
    (analyze_btn, analyze_sequence),
    (video_btn, create_breathing_video),
    (continuity_btn, check_continuity),
):
    _button.on_click(_handler)

In [12]:
# UI Layout: a control panel stacked above one output tab per analysis.
heading = w.HTML("<h3>4D-Lung Dataset Explorer</h3>")
selection_row = w.HBox([patient_dd, study_dd])
slice_row = w.HBox([slice_slider])
button_row = w.HBox([analyze_btn, video_btn, continuity_btn])
controls = w.VBox([heading, selection_row, slice_row, button_row])

tabs = w.Tab()
tabs.children = [sequence_out, video_out, continuity_out]
for tab_index, tab_title in enumerate(("Sequence Analysis", "Breathing Video", "Continuity Check")):
    tabs.set_title(tab_index, tab_title)

ui = w.VBox([controls, tabs])

# Fill the study dropdown for the first patient before the UI is shown.
if patients:
    update_studies({'new': patients[0]})

display(ui)

VBox(children=(VBox(children=(HTML(value='<h3>4D-Lung Dataset Explorer</h3>'), HBox(children=(Dropdown(descrip…

In [15]:
# Data Export Functions
def export_dataset_summary():
    """Export dataset summary for further analysis

    Writes ``dataset_summary.json`` into ``RESULTS_DIR`` with dataset-wide
    totals and, per patient, the studies, series count, modalities, slice
    count range and whether any series carries respiratory-phase or
    temporal-position header values. Prints a message and writes nothing
    when ``dataset_df`` is empty.
    """
    if len(dataset_df) == 0:
        print("No data to export")
        return

    def has_known_values(values):
        # Missing (NaN) entries and the 'Unknown' placeholder both mean "no info".
        return bool((values.notna() & (values != 'Unknown')).any())

    # Create summary report
    summary = {
        'dataset': '4D-Lung',
        'analysis_date': datetime.now().isoformat(),
        'total_patients': int(dataset_df['patient_id'].nunique()),
        'total_series': int(len(dataset_df)),
        'modalities': dataset_df['modality'].unique().tolist(),
        'patients': {}
    }

    for patient_id in dataset_df['patient_id'].unique():
        patient_data = dataset_df[dataset_df['patient_id'] == patient_id]

        summary['patients'][patient_id] = {
            'studies': patient_data['study_desc'].unique().tolist(),
            'series_count': int(len(patient_data)),
            'modalities': patient_data['modality'].unique().tolist(),
            'has_respiratory_info': has_known_values(patient_data['respiratory_motion_phase']),
            'has_temporal_info': has_known_values(patient_data['temporal_position']),
            # Cast to plain int: numpy integers are not JSON serialisable.
            'slice_range': [int(patient_data['n_slices'].min()), int(patient_data['n_slices'].max())],
        }

    # Save summary
    summary_path = RESULTS_DIR / "dataset_summary.json"
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f"Dataset summary exported to: {summary_path}")


# Export button: created once at cell level. It must not live inside
# export_dataset_summary(), otherwise every export displays another button
# whose click handler displays yet another one.
export_btn = w.Button(description="Export Dataset Summary", button_style="info")
export_btn.on_click(lambda _: export_dataset_summary())

display(export_btn)

# %%

In [16]:
# Write the JSON summary of the currently loaded dataset into RESULTS_DIR.
export_dataset_summary()

Dataset summary exported to: /mnt/tcia_data/processed/4D-Lung/dataset_summary.json


Button(button_style='info', description='Export Dataset Summary', style=ButtonStyle())

Button(button_style='info', description='Export Dataset Summary', style=ButtonStyle())