In [14]:
import kagglehub
path = kagglehub.dataset_download("tarunteja09/sunnybrook-cardiac-sturctured-2d")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tarunteja09/sunnybrook-cardiac-sturctured-2d?dataset_version_number=1...


100%|██████████| 2.53G/2.53G [00:34<00:00, 79.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tarunteja09/sunnybrook-cardiac-sturctured-2d/versions/1


In [15]:
path = kagglehub.dataset_download("orvile/carotid-ultrasound-images")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/orvile/carotid-ultrasound-images?dataset_version_number=2...


100%|██████████| 284M/284M [00:01<00:00, 153MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/orvile/carotid-ultrasound-images/versions/2


In [17]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import glob

# Define dataset paths (Standard Kaggle paths)
dataset1_path = '/root/.cache/kagglehub/datasets/tarunteja09/sunnybrook-cardiac-sturctured-2d/versions/1'
dataset2_path = '/root/.cache/kagglehub/datasets/orvile/carotid-ultrasound-images/versions/2'

def explore_dataset(name, root_path):
    print(f"--- Exploring {name} ---")
    all_files = []
    for root, dirs, files in os.walk(root_path):
        for file in files:
            all_files.append(os.path.join(root, file))

    # 1. Total Count
    total_files = len(all_files)
    image_extensions = ('.png', '.jpg', '.jpeg', '.dcm', '.bmp')
    images = [f for f in all_files if f.lower().endswith(image_extensions)]
    print(f"Total files: {total_files}")
    print(f"Total image files: {len(images)}")

    # 2. Structure / Classes
    # In medical datasets, classes are often folder names or prefixes in filenames
    subdirs = [d for d in os.listdir(root_path) if os.path.isdir(os.path.join(root_path, d))]
    print(f"Subdirectories/Classes found: {subdirs}")

# Run Exploration
explore_dataset("Carotid Ultrasound", dataset1_path)
explore_dataset("Sunnybrook Cardiac 2D", dataset2_path)

# Custom analysis for Imbalance (Carotid)
# The Carotid dataset is split into 'US images' and 'Expert mask images'
us_path = glob.glob(f"{dataset1_path}/**/US images/*.png", recursive=True)
mask_path = glob.glob(f"{dataset1_path}/**/Expert mask images/*.png", recursive=True)
print(f"Carotid - US: {len(us_path)}, Masks: {len(mask_path)}")

--- Exploring Carotid Ultrasound ---
Total files: 53072
Total image files: 51545
Subdirectories/Classes found: ['extracted']
--- Exploring Sunnybrook Cardiac 2D ---
Total files: 2201
Total image files: 2200
Subdirectories/Classes found: ['Common Carotid Artery Ultrasound Images']
Carotid - US: 0, Masks: 0


In [19]:

from pathlib import Path

def analyze_dataset(dataset_name, root_path, pathology_mapping=None):
    """
    Analyzes a medical imaging dataset and prints a detailed report.
    """
    print(f"{'='*60}")
    print(f"ANALYSIS FOR: {dataset_name}")
    print(f"{'='*60}")

    root = Path(root_path)
    if not root.exists():
        print(f"Error: Path {root_path} not found.")
        return

    # 1. Type of Imaging Data
    # Heuristic: Check common medical formats or metadata clues
    img_extensions = ('.png', '.jpg', '.jpeg', '.dcm', '.tif')
    all_files = list(root.rglob('*'))
    images = [f for f in all_files if f.suffix.lower() in img_extensions]

    if "carotid" in dataset_name.lower():
        img_type = "Ultrasound (B-mode) - Carotid Artery"
    elif "cardiac" in dataset_name.lower() or "sunnybrook" in dataset_name.lower():
        img_type = "Cardiac MRI (Cine-MRI 2D Slices)"
    else:
        img_type = "Determined from files: " + (images[0].suffix if images else "Unknown")

    # 2. Number of Images
    total_images = len(images)

    # 3. Available Classes/Labels & 4. Dataset Imbalance
    class_counts = {}

    # Check if classes are folder-based
    subdirs = [d for d in root.iterdir() if d.is_dir()]

    # Logic for Sunnybrook (Pathology based folders)
    if "cardiac" in dataset_name.lower():
        # Typically structured as TRAIN/VAL -> Pathology
        for img in images:
            pathology = img.parent.name
            class_counts[pathology] = class_counts.get(pathology, 0) + 1

    # Logic for Carotid (Subject based + Image/Mask)
    elif "carotid" in dataset_name.lower():
        masks = [f for f in images if "mask" in str(f).lower()]
        raw_images = [f for f in images if "mask" not in str(f).lower()]
        class_counts = {"Raw Ultrasound": len(raw_images), "Expert Masks": len(masks)}

    # Output details
    print(f"● Type of imaging data: {img_type}")
    print(f"● Number of images: {total_images}")
    print(f"● Available classes or labels:")
    for cls, count in class_counts.items():
        print(f"   - {cls}: {count} images")

    # Calculate Imbalance
    if class_counts:
        max_c = max(class_counts.values())
        min_c = min(class_counts.values())
        if min_c == 0:
            print(f"● Dataset imbalance: Cannot calculate ratio due to classes with 0 images.")
        else:
            imbalance_ratio = round(max_c / min_c, 2)
            print(f"● Dataset imbalance: Ratio {imbalance_ratio}:1 (Max class vs Min class)")
    else:
        print(f"● Dataset imbalance: Not applicable (single class or regression)")

    # 5. Observed Challenges (Automated & Contextual)
    print(f"● Challenges observed:")

    # Challenge: Small patient pool (Carotid)
    if "carotid" in dataset_name.lower():
        subjects = set([f.parent.parent.name for f in images if f.parent.name == "US images"])
        print(f"   - Data Scope: Limited subject diversity ({len(subjects)} subjects). High risk of overfitting.")
        print(f"   - Noise: Significant speckle noise inherent to B-mode ultrasound.")

    # Challenge: Image Consistency (Sunnybrook)
    if images:
        try:
            with Image.open(images[0]) as img:
                res = img.size
            inconsistent = any(Image.open(i).size != res for i in images[:20])
            if inconsistent:
                print(f"   - Pre-processing: Inconsistent image resolutions detected across slices.")
        except:
            pass

    if "cardiac" in dataset_name.lower():
        print(f"   - Quality: 2D extraction from Cine-MRI may include motion artifacts or blurred frames.")
        print(f"   - Annotation: Contours are often older (2009 standards); may differ from modern LV segmentation guidelines.")

    print("\n")

dataset1_path = '/root/.cache/kagglehub/datasets/tarunteja09/sunnybrook-cardiac-sturctured-2d/versions/1'
dataset2_path = '/root/.cache/kagglehub/datasets/orvile/carotid-ultrasound-images/versions/2'

analyze_dataset("Sunnybrook Cardiac Structured 2D", dataset1_path) # Corrected: Sunnybrook name with Sunnybrook path
analyze_dataset("Carotid Ultrasound Images", dataset2_path)       # Corrected: Carotid name with Carotid path

ANALYSIS FOR: Sunnybrook Cardiac Structured 2D
● Type of imaging data: Cardiac MRI (Cine-MRI 2D Slices)
● Number of images: 51545
● Available classes or labels:
   - CINESAX_301: 680 images
   - CINELAX_307: 60 images
   - CINEAORTA_308: 100 images
   - unnamed_356: 81 images
   - unnamed_313: 30 images
   - CINESAX_303: 640 images
   - CINESAX_300: 2100 images
   - unnamed_354: 590 images
   - unnamed_310: 27 images
   - unnamed_351: 597 images
   - unnamed_355: 487 images
   - unnamed_314: 25 images
   - PERFLAX_309: 210 images
   - CINELAX_305: 240 images
   - unnamed_311: 38 images
   - unnamed_312: 36 images
   - unnamed_350: 609 images
   - unnamed_352: 597 images
   - CINELAX_306: 260 images
   - CINELAX_304: 680 images
   - unnamed_2: 3232 images
   - Localizers_1: 741 images
   - unnamed_353: 586 images
   - unnamed_409: 7 images
   - unnamed_412: 7 images
   - unnamed_450: 109 images
   - unnamed_420: 2 images
   - unnamed_417: 8 images
   - CINESAX_402: 300 images
   - unnam