# Duke Breast Cancer MRI Dataset Explorer

This notebook helps explore and analyze the Duke Breast Cancer MRI dataset with the following structure:
```
root_dir/
├── Breast_MRI_001/
│   └── patient_directory/
│       ├── dynamic_sequence_1/
│       │   └── *.dcm files
│       ├── dynamic_sequence_2/
│       │   └── *.dcm files
│       └── ...
├── Breast_MRI_002/
└── ...
```

We'll use `pydicom` to read the DICOM files, and `numpy`, `pandas`, `matplotlib` and `seaborn` to analyze and visualize them.

## 1. Install Required Libraries

In [1]:
# Install necessary packages
!pip install pydicom matplotlib numpy pandas seaborn tqdm pillow

Collecting pydicom
  Using cached pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.56.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (101 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x8

## 2. Import Libraries

In [2]:
import os
import pydicom
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from PIL import Image
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor
import warnings
warnings.filterwarnings('ignore')

## 3. Dataset Detection Functions

In [3]:
def detect_dataset_structure(root_dir):
    """
    Explore the dataset structure and return summary statistics.

    The expected layout is
    ``root_dir/Breast_MRI_XXX/<study dir>/<sequence dir>/*.dcm``.
    Only first-level directories whose name starts with ``Breast_MRI_`` are
    counted as patients. For each patient exactly one study directory is
    scanned: the first one in sorted order. If a patient has more than one,
    a warning is printed and the others are ignored.

    Parameters
    ----------
    root_dir : str
        Path to the dataset root directory.

    Returns
    -------
    dict
        ``total_patients``, ``total_studies``, ``total_sequences`` and
        ``total_dcm_files`` (ints), ``patient_ids`` (sorted list of patient
        folder names) and ``patient_details`` (patient folder name -> dict
        with ``patient_directory`` (str or None), ``sequences`` (list),
        ``sequence_counts`` (sequence name -> number of ``.dcm`` files) and
        ``total_files``). All counters are zero when ``root_dir`` is missing
        or is not a directory.
    """
    print(f"Analyzing dataset structure in: {root_dir}")

    # Dictionary to store structure information
    dataset_info = {
        'total_patients': 0,
        'total_studies': 0,
        'total_sequences': 0,
        'total_dcm_files': 0,
        'patient_ids': [],
        'patient_details': {}
    }

    # Bail out early on an unusable root instead of letting os.listdir raise
    if not os.path.exists(root_dir):
        print(f"Error: The directory {root_dir} does not exist.")
        return dataset_info
    if not os.path.isdir(root_dir):
        print(f"Error: {root_dir} is not a directory.")
        return dataset_info

    # Loop through first level directories (Breast_MRI_XXX)
    for patient_folder in sorted(os.listdir(root_dir)):
        patient_path = os.path.join(root_dir, patient_folder)

        if not os.path.isdir(patient_path) or not patient_folder.startswith('Breast_MRI_'):
            continue

        dataset_info['total_patients'] += 1
        dataset_info['patient_ids'].append(patient_folder)

        patient_info = {
            'patient_directory': None,
            'sequences': [],
            'sequence_counts': {},
            'total_files': 0
        }

        # os.listdir returns entries in arbitrary order; sort so that the
        # study directory picked below is the same on every run and platform.
        patient_subdirs = sorted(
            d for d in os.listdir(patient_path)
            if os.path.isdir(os.path.join(patient_path, d))
        )

        if patient_subdirs:
            # Only one study directory per patient is scanned
            patient_directory = patient_subdirs[0]
            if len(patient_subdirs) > 1:
                print(f"Warning: {patient_folder} contains {len(patient_subdirs)} "
                      f"directories; only '{patient_directory}' is scanned.")
            patient_info['patient_directory'] = patient_directory
            dataset_info['total_studies'] += 1

            patient_dir_path = os.path.join(patient_path, patient_directory)

            # Loop through sequences
            for sequence_folder in sorted(os.listdir(patient_dir_path)):
                sequence_path = os.path.join(patient_dir_path, sequence_folder)

                if not os.path.isdir(sequence_path):
                    continue

                dataset_info['total_sequences'] += 1
                patient_info['sequences'].append(sequence_folder)

                # Count DICOM files (by file extension only; files are not opened)
                file_count = sum(
                    1 for f in os.listdir(sequence_path) if f.endswith('.dcm')
                )
                patient_info['sequence_counts'][sequence_folder] = file_count
                patient_info['total_files'] += file_count
                dataset_info['total_dcm_files'] += file_count

        dataset_info['patient_details'][patient_folder] = patient_info

    return dataset_info

In [4]:
def display_dataset_summary(dataset_info):
    """
    Print an overview of a scanned dataset.

    Always prints the four totals stored in ``dataset_info``. When at least
    one patient was found it additionally prints per-patient averages, the
    (up to) ten most frequent sequence folder names and the first five
    patient IDs in sorted order.

    Parameters
    ----------
    dataset_info : dict
        Needs the keys ``total_patients``, ``total_studies``,
        ``total_sequences``, ``total_dcm_files``, ``patient_ids`` and
        ``patient_details`` (each detail entry providing ``sequences``).

    Returns
    -------
    None
    """
    print("\n===== DATASET SUMMARY =====")
    headline_fields = (
        ("Total patients (Breast_MRI_XXX folders)", 'total_patients'),
        ("Total studies (patient directories)", 'total_studies'),
        ("Total sequences", 'total_sequences'),
        ("Total DICOM files", 'total_dcm_files'),
    )
    for label, key in headline_fields:
        print(f"{label}: {dataset_info[key]}")

    patient_total = dataset_info['total_patients']
    if not patient_total > 0:
        # Nothing to average or list for an empty dataset
        return

    sequences_per_patient = dataset_info['total_sequences'] / patient_total
    files_per_patient = dataset_info['total_dcm_files'] / patient_total
    print(f"Average sequences per patient: {sequences_per_patient:.2f}")
    print(f"Average DICOM files per patient: {files_per_patient:.2f}")

    # Tally how often each sequence folder name occurs across all patients
    name_tally = Counter(
        name
        for details in dataset_info['patient_details'].values()
        for name in details['sequences']
    )
    print("\nMost common sequence names:")
    for seq_name, occurrences in name_tally.most_common(10):
        print(f"  - {seq_name}: {occurrences} occurrences")

    # Show a handful of patient IDs and summarize the remainder
    print("\nSample of patient IDs:")
    patient_ids = dataset_info['patient_ids']
    for shown_id in sorted(patient_ids)[:5]:
        print(f"  - {shown_id}")
    remaining = len(patient_ids) - 5
    if remaining > 0:
        print(f"  - ... and {remaining} more")

## 4. Detect the Duke Breast Cancer MRI Dataset

In [6]:
# Dataset root. This is a relative path, so it is resolved against the
# kernel's current working directory.
root_dir = "../data/Duke-Breast-Cancer-MRI"  # Change this to your actual path

# Walk the directory tree and collect per-patient / per-sequence counts
# (the function also prints the path it is analyzing)
dataset_info = detect_dataset_structure(root_dir)

# Print the totals, averages, most common sequence names and sample patient IDs
display_dataset_summary(dataset_info)

Analyzing dataset structure in: ../data/Duke-Breast-Cancer-MRI

===== DATASET SUMMARY =====
Total patients (Breast_MRI_XXX folders): 1
Total studies (patient directories): 1
Total sequences: 6
Total DICOM files: 842
Average sequences per patient: 6.00
Average DICOM files per patient: 842.00

Most common sequence names:
  - 3.000000-ax t1-75455: 1 occurrences
  - 600.000000-ax 3d dyn MP-31458: 1 occurrences
  - 601.000000-Ph1ax 3d dyn MP-61179: 1 occurrences
  - 602.000000-Ph2ax 3d dyn MP-76388: 1 occurrences
  - 603.000000-Ph3ax 3d dyn MP-16301: 1 occurrences
  - 604.000000-Ph4ax 3d dyn MP-57837: 1 occurrences

Sample of patient IDs:
  - Breast_MRI_358


## 5. Analyze DICOM Metadata

In [7]:
def sample_dicom_metadata(dataset_info, root_dir, sample_size=5):
    """
    Read up to ``sample_size`` DICOM files and return them with their origin.

    Patients are visited in the order of ``dataset_info['patient_ids']`` and,
    within a patient, sequences in their listed order. For each sequence the
    first ``.dcm`` file (by sorted file name) is read. Sampling stops as soon
    as ``sample_size`` files have been read, so a single patient with enough
    sequences can fill the whole sample.

    Patients without a study directory, sequence directories that are missing
    on disk, sequences without ``.dcm`` files and files that fail to parse
    are skipped (the latter two with a printed message where applicable).

    Parameters
    ----------
    dataset_info : dict
        Structure with ``patient_ids`` and ``patient_details`` as produced by
        ``detect_dataset_structure``.
    root_dir : str
        Dataset root the patient folders live in.
    sample_size : int, optional
        Maximum number of files to read (default 5).

    Returns
    -------
    list of dict
        One entry per file read, with keys ``patient_id``, ``sequence``,
        ``file`` (file name) and ``dicom`` (the object returned by
        ``pydicom.dcmread``).
    """
    metadata_samples = []

    for patient_id in dataset_info['patient_ids']:
        if len(metadata_samples) >= sample_size:
            break

        patient_details = dataset_info['patient_details'][patient_id]
        patient_directory = patient_details['patient_directory']
        if patient_directory is None:
            # Patient folder had no sub-directory; os.path.join would raise
            # TypeError on None, and there is nothing to sample anyway.
            continue
        patient_dir = os.path.join(root_dir, patient_id, patient_directory)

        # Try each sequence
        for sequence in patient_details['sequences']:
            if len(metadata_samples) >= sample_size:
                break

            sequence_path = os.path.join(patient_dir, sequence)
            if not os.path.isdir(sequence_path):
                # dataset_info may be stale or built for a different root
                print(f"Warning: sequence directory not found: {sequence_path}")
                continue

            # Sort so "the first file" is deterministic (os.listdir order is arbitrary)
            dcm_files = sorted(f for f in os.listdir(sequence_path) if f.endswith('.dcm'))
            if not dcm_files:
                continue

            first_file = dcm_files[0]
            dicom_path = os.path.join(sequence_path, first_file)
            try:
                dcm = pydicom.dcmread(dicom_path)
            except Exception as e:
                # Best effort: report the unreadable file and move on
                print(f"Error reading DICOM file {dicom_path}: {e}")
                continue

            metadata_samples.append({
                'patient_id': patient_id,
                'sequence': sequence,
                'file': first_file,
                'dicom': dcm
            })

    return metadata_samples

In [8]:
def display_dicom_metadata(metadata_samples):
    """
    Display metadata from sampled DICOM files.

    For every sample this prints where the file came from, the subset of
    ``common_tags`` that the file actually carries, the image dimensions
    (or a note that they are unavailable), the frame count when present and
    the subset of ``mri_tags`` the file carries. Missing tags are skipped
    silently.

    Parameters
    ----------
    metadata_samples : list of dict
        Entries with keys ``patient_id``, ``sequence``, ``file`` and
        ``dicom``; ``dicom`` is any object exposing DICOM keywords as
        attributes (e.g. the result of ``pydicom.dcmread``).

    Returns
    -------
    None
    """
    common_tags = ['PatientName', 'PatientID', 'PatientBirthDate', 'PatientSex',
                   'StudyDescription', 'SeriesDescription', 'Modality',
                   'Manufacturer', 'ManufacturerModelName', 'MagneticFieldStrength',
                   'PixelSpacing', 'SliceThickness', 'RepetitionTime', 'EchoTime']
    # MRI-specific tags, printed in their own section (loop-invariant)
    mri_tags = ['ScanningSequence', 'SequenceVariant', 'ScanOptions', 'ContrastBolusAgent']

    for i, sample in enumerate(metadata_samples, 1):
        dcm = sample['dicom']
        print(f"\n===== DICOM Sample {i} =====")
        print(f"Patient ID: {sample['patient_id']}")
        print(f"Sequence: {sample['sequence']}")
        print(f"File: {sample['file']}")
        print("\nKey Metadata:")

        for tag in common_tags:
            if hasattr(dcm, tag):
                print(f"  - {tag}: {getattr(dcm, tag)}")

        # Rows/Columns are guarded like every other tag, so a file without
        # them does not abort the whole listing with AttributeError.
        rows = getattr(dcm, 'Rows', None)
        columns = getattr(dcm, 'Columns', None)
        if rows is not None and columns is not None:
            print(f"\nImage dimensions: {rows} x {columns}")
        else:
            print("\nImage dimensions: not available")
        if hasattr(dcm, 'NumberOfFrames'):
            print(f"Number of frames: {dcm.NumberOfFrames}")

        print("\nMRI-specific tags:")
        for tag in mri_tags:
            if hasattr(dcm, tag):
                print(f"  - {tag}: {getattr(dcm, tag)}")