In [1]:
# Original data lives in data_original/image and data_original/mask.
# The split copies are written to train/, val/ and test/.
import os
import shutil
from glob import glob

# tqdm for notebook
from tqdm import tqdm

# Collect every path found in the raw image folder and the raw mask folder.
image_files, mask_files = (
    glob(folder_pattern)
    for folder_pattern in ("data_original/image/*", "data_original/mask/*")
)

# Report how many entries each folder contains.
print(f"Total images found: {len(image_files)}")
print(f"Total masks found: {len(mask_files)}")


Total images found: 5240
Total masks found: 5240


In [2]:
# Validate that images and masks pair up one-to-one by file stem.
assert len(image_files) == len(mask_files), "Number of images and masks do not match"

# os.path.splitext removes only the final extension, so a stem that itself
# contains dots stays intact (split('.')[0] would cut it at the first dot).
image_basenames = {os.path.splitext(os.path.basename(f))[0] for f in image_files}
mask_basenames = {os.path.splitext(os.path.basename(f))[0] for f in mask_files}

# A set silently collapses repeated stems (e.g. the same name stored with two
# different extensions), so confirm that no file was lost in the conversion.
assert len(image_basenames) == len(image_files), "Duplicate image basenames found"
assert len(mask_basenames) == len(mask_files), "Duplicate mask basenames found"
assert image_basenames == mask_basenames, "Image and mask filenames do not match"

# The following cells rebuild paths as "<stem>.png", so check that every
# stem really exists as a .png file in both folders.
for fname in tqdm(image_basenames):
    img_path = f"data_original/image/{fname}.png"
    mask_path = f"data_original/mask/{fname}.png"

    # Simple check to ensure files exist
    assert os.path.exists(img_path), f"Image file missing: {img_path}"
    assert os.path.exists(mask_path), f"Mask file missing: {mask_path}"

100%|██████████| 5240/5240 [00:20<00:00, 256.52it/s]


In [3]:
# List the distinct patient IDs, i.e. the part of each file stem that
# precedes the first '-', in sorted order.
print("All patient IDs:")
ids = sorted({stem.split('-')[0] for stem in image_basenames})
print(ids)
print(f"Total unique patients: {len(ids)}")

All patient IDs:
['004', '006', '007', '040', '042', '047', '055', '057', '068', '075', '090', '094', '105', '117', '129', '141', '160', '163', '170', '175', '180', '188', '191', '196', '199', '238', '241', '260', '267', '275', '277', '290', '305', '308', '312', '315', '317', '320', '323', '324', '325', '350', '366', '375', '383', '386', '387', '389', '391', '392', '400', '414', '424', '426', '437', '438', '441', '443', '490', '495', '502', '531', '550', '570', '572', '605', '612', '661', '666', '673', '686', '692', '699', '718', '743', '744', '747', '752', '778', '789', '790', '807', '822', '832', '837', '840', '841', '842', '844', '845', '852', '863', '865', '881', '886', '890', '900', '918', '919', '943', '945', '948', '975', '977', '987', '989', '990']
Total unique patients: 107


In [4]:
# How many images does each patient have?
# Tally in a single pass over the file stems instead of rescanning every stem
# for every patient. The patient ID is taken as the text before the first '-',
# the same rule used when the ID list and the splits are built.
images_per_patient = {}
for fname in image_basenames:
    patient_id = fname.split('-')[0]
    images_per_patient[patient_id] = images_per_patient.get(patient_id, 0) + 1

for pid in ids:
    count = images_per_patient.get(pid, 0)
    print(f"Patient {pid} has {count} images.")

Patient 004 has 49 images.
Patient 006 has 49 images.
Patient 007 has 49 images.
Patient 040 has 49 images.
Patient 042 has 49 images.
Patient 047 has 49 images.
Patient 055 has 49 images.
Patient 057 has 49 images.
Patient 068 has 49 images.
Patient 075 has 49 images.
Patient 090 has 49 images.
Patient 094 has 49 images.
Patient 105 has 49 images.
Patient 117 has 49 images.
Patient 129 has 49 images.
Patient 141 has 49 images.
Patient 160 has 49 images.
Patient 163 has 49 images.
Patient 170 has 49 images.
Patient 175 has 49 images.
Patient 180 has 49 images.
Patient 188 has 49 images.
Patient 191 has 49 images.
Patient 196 has 49 images.
Patient 199 has 49 images.
Patient 238 has 49 images.
Patient 241 has 49 images.
Patient 260 has 49 images.
Patient 267 has 49 images.
Patient 275 has 49 images.
Patient 277 has 49 images.
Patient 290 has 49 images.
Patient 305 has 49 images.
Patient 308 has 49 images.
Patient 312 has 49 images.
Patient 315 has 49 images.
Patient 317 has 49 images.
P

In [5]:
# Patient-level split: the subset is chosen from the patient ID alone, so all
# images of one patient land in the same subset. `ids` is sorted, which makes
# the assignment deterministic across runs.
train_ids = ids[0:75]  # First 75 patients for training
val_ids = ids[75:86]   # Next 11 patients for validation
test_ids = ids[86:]    # All remaining patients for testing

train_imgs = 0
val_imgs = 0
test_imgs = 0

# Count how many images fall into each subset.
for fname in tqdm(image_basenames):
    patient_id = fname.split('-')[0]

    if patient_id in train_ids:
        train_imgs += 1
    elif patient_id in val_ids:
        val_imgs += 1
    elif patient_id in test_ids:
        test_imgs += 1

# Guard against stale kernel state (e.g. `ids` computed from a different file
# list): every image must have been counted in exactly one subset.
assert train_imgs + val_imgs + test_imgs == len(image_basenames), \
    "Some images were not assigned to any split"

print(f"Total training images: {train_imgs}")
print(f"Total validation images: {val_imgs}")
print(f"Total testing images: {test_imgs}")

100%|██████████| 5240/5240 [00:00<00:00, 238398.03it/s]

Total training images: 3672
Total validation images: 539
Total testing images: 1029





In [6]:
# Create the output tree (<split>/image and <split>/mask) if it does not exist.
for split_name in ("train", "val", "test"):
    for kind in ("image", "mask"):
        os.makedirs(f"{split_name}/{kind}", exist_ok=True)

# Copy every image/mask pair into the folder of the split its patient belongs to.
for fname in tqdm(image_basenames):
    patient_id = fname.split('-')[0]
    img_path = f"data_original/image/{fname}.png"
    mask_path = f"data_original/mask/{fname}.png"

    if patient_id in train_ids:
        split_name = "train"
    elif patient_id in val_ids:
        split_name = "val"
    elif patient_id in test_ids:
        split_name = "test"
    else:
        # Without this branch the destination from the previous iteration
        # would be reused and the file would end up in the wrong split.
        raise ValueError(f"Patient {patient_id} is not assigned to any split")

    dest_img_path = f"{split_name}/image/{fname}.png"
    dest_mask_path = f"{split_name}/mask/{fname}.png"

    # shutil.copy runs in-process (no shell spawned per file, no quoting
    # problems, works on any OS) and raises if a copy fails, whereas
    # os.system("cp ...") only returns an exit code that was never checked.
    shutil.copy(img_path, dest_img_path)
    shutil.copy(mask_path, dest_mask_path)

print("Data split completed.")

100%|██████████| 5240/5240 [04:52<00:00, 17.91it/s]

Data split completed.



