In [15]:
# filepath: [计数.ipynb](http://_vscodecontentref_/0)
import os
from pathlib import Path

# Root directory of the Dmass dataset.
dmass_root = "d:/Dataset/New_CBDDSM/Dmass"

# Sub-directory layout to inspect: (split, type, class).
subfolders = [
    ("train", "images", "benign"),
    ("train", "images", "malignant"),
    ("train", "masks", "benign"),
    ("train", "masks", "malignant"),
    ("test", "images", "benign"),
    ("test", "images", "malignant"),
    ("test", "masks", "benign"),
    ("test", "masks", "malignant"),
    ("val", "images", "benign"),
    ("val", "images", "malignant"),
    ("val", "masks", "benign"),
    ("val", "masks", "malignant"),
]

# File-name suffixes (lower-case) that are counted as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def count_image_files(folder, extensions=IMAGE_EXTENSIONS):
    """Return the number of image files directly inside *folder*.

    Args:
        folder: Path of the directory to inspect (not searched recursively).
        extensions: Lower-case file-name suffixes to accept; names are
            compared case-insensitively.

    Returns:
        The number of regular files whose name ends with one of
        *extensions*, or 0 when *folder* is missing or is not a directory.
    """
    # isdir() rather than exists(): a regular file at this path would make
    # the directory listing raise NotADirectoryError.
    if not os.path.isdir(folder):
        return 0
    suffixes = tuple(extensions)  # str.endswith() requires a tuple
    with os.scandir(folder) as entries:
        return sum(
            1
            for entry in entries
            if entry.is_file() and entry.name.lower().endswith(suffixes)
        )


# Maps "split/type/class" -> number of image files found in that folder.
counts = {}

# Count the image files in every configured folder.
for subfolder_parts in subfolders:
    folder_path = os.path.join(dmass_root, *subfolder_parts)
    count = count_image_files(folder_path)

    # Keys always use "/" so the report code can rebuild them with f-strings.
    folder_name = "/".join(subfolder_parts)
    counts[folder_name] = count

# Print the report header.
print("Dmass数据集文件计数：")
print("-" * 50)

# Grouping axes used to lay out the report (and to rebuild the count keys).
datasets = ["train", "test", "val"]
types = ["images", "masks"]
classes = ["benign", "malignant"]

# Running total over every split, type and class.
total_images = 0

for split in datasets:
    print(f"\n{split.upper()} 集合:")
    split_sum = 0

    for kind in types:
        print(f"  {kind}:")
        # Look up each class count once; folders never counted default to 0.
        per_class = [
            (label, counts.get(f"{split}/{kind}/{label}", 0))
            for label in classes
        ]
        for label, n_files in per_class:
            print(f"    - {label}: {n_files} 个文件")

        kind_sum = sum(n_files for _, n_files in per_class)
        print(f"    小计: {kind_sum} 个文件")
        split_sum += kind_sum

    print(f"  {split} 总计: {split_sum} 个文件")
    total_images += split_sum

print("\n" + "-" * 50)
print(f"数据集总计: {total_images} 个文件")

# Show the benign / malignant distribution.
# Each class total sums that class over every split and every type
# (images and masks alike), so the two totals add up to total_images.
benign_total = sum(counts.get(f"{d}/{t}/benign", 0) for d in datasets for t in types)
malignant_total = sum(counts.get(f"{d}/{t}/malignant", 0) for d in datasets for t in types)

# Missing folders are counted as 0, so total_images can be 0 (e.g. a wrong
# dataset root); report 0.0% in that case instead of raising
# ZeroDivisionError.
if total_images:
    benign_pct = benign_total / total_images * 100
    malignant_pct = malignant_total / total_images * 100
else:
    benign_pct = malignant_pct = 0.0

print(f"\n良恶性分布:")
print(f"  - 良性(benign): {benign_total} 个文件 ({benign_pct:.1f}%)")
print(f"  - 恶性(malignant): {malignant_total} 个文件 ({malignant_pct:.1f}%)")

Dmass数据集文件计数：
--------------------------------------------------

TRAIN 集合:
  images:
    - benign: 508 个文件
    - malignant: 478 个文件
    小计: 986 个文件
  masks:
    - benign: 508 个文件
    - malignant: 478 个文件
    小计: 986 个文件
  train 总计: 1972 个文件

TEST 集合:
  images:
    - benign: 218 个文件
    - malignant: 143 个文件
    小计: 361 个文件
  masks:
    - benign: 218 个文件
    - malignant: 143 个文件
    小计: 361 个文件
  test 总计: 722 个文件

VAL 集合:
  images:
    - benign: 126 个文件
    - malignant: 119 个文件
    小计: 245 个文件
  masks:
    - benign: 126 个文件
    - malignant: 119 个文件
    小计: 245 个文件
  val 总计: 490 个文件

--------------------------------------------------
数据集总计: 3184 个文件

良恶性分布:
  - 良性(benign): 1704 个文件 (53.5%)
  - 恶性(malignant): 1480 个文件 (46.5%)
