In [1]:
# ==========================================================
# 04_preprocess_wsi.ipynb
# Purpose: Collect series-level metadata from every TCIA DICOM folder and save it as one CSV.
# DICOM folders are discovered automatically by walking data/raw/tcia.
# ==========================================================

import os
import pandas as pd
import SimpleITK as sitk
from pathlib import Path


In [2]:
# ----------------------------------------------------------
# Step 1: Define project paths
# ----------------------------------------------------------
# The project root can be overridden with the CR_COAD_PROJECT_ROOT
# environment variable so the notebook is not tied to a single machine.
# When the variable is unset, the original location is used, so existing
# runs behave as before.
PROJECT_ROOT = Path(
    os.environ.get(
        "CR_COAD_PROJECT_ROOT",
        r"C:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project",
    )
)

# Fail early with an actionable message instead of the bare chdir error.
if not PROJECT_ROOT.is_dir():
    raise FileNotFoundError(
        f"Project root not found: {PROJECT_ROOT}. "
        "Set the CR_COAD_PROJECT_ROOT environment variable to the project folder."
    )

# All paths below are relative to the project root.
os.chdir(PROJECT_ROOT)

RAW_TCIA_DIR = Path("data/raw/tcia")          # root of the downloaded TCIA data
OUT_DIR = Path("data/processed/clinical")     # destination folder for the CSV
OUT_DIR.mkdir(parents=True, exist_ok=True)    # create it (and parents) if needed
OUT_FILE = OUT_DIR / "tcia_series_metadata.csv"

In [3]:
# ----------------------------------------------------------
# Step 2: Discover all DICOM folders
# ----------------------------------------------------------
# Walk the raw TCIA tree and keep every directory that directly contains
# at least one file whose name ends in ".dcm" (case-insensitive).
dcm_dirs = [
    Path(root)
    for root, _subdirs, filenames in os.walk(RAW_TCIA_DIR)
    if any(name.lower().endswith(".dcm") for name in filenames)
]

# Nothing to process: stop the run with an explanatory message.
if len(dcm_dirs) == 0:
    raise SystemExit("‚ùå No DICOM files found. Make sure your TCIA dataset is fully downloaded.")

print(f"‚úÖ Found {len(dcm_dirs)} DICOM folders under {RAW_TCIA_DIR}")


‚úÖ Found 521 DICOM folders under data\raw\tcia


In [4]:
# ----------------------------------------------------------
# Step 3: Extract series metadata from each folder
# ----------------------------------------------------------
# DICOM tags in the "gggg|eeee" key format used by SimpleITK metadata.
TAG_PATIENT_ID = "0010|0020"          # (0010,0020) Patient ID
TAG_PATIENT_NAME = "0010|0010"        # (0010,0010) Patient's Name
TAG_STUDY_UID = "0020|000d"           # (0020,000D) Study Instance UID
TAG_SERIES_DESCRIPTION = "0008|103e"  # (0008,103E) Series Description
TAG_MODALITY = "0008|0060"            # (0008,0060) Modality
TAG_MANUFACTURER = "0008|0070"        # (0008,0070) Manufacturer


def _read_tag(reader, key):
    """Return the metadata value stored under `key`, or None if the key is absent."""
    return reader.GetMetaData(key) if reader.HasMetaDataKey(key) else None


records = []

# sorted(set(...)) removes duplicate folders and fixes the processing order.
for dcm_dir in sorted(set(dcm_dirs)):
    try:
        series_ids = sitk.ImageSeriesReader.GetGDCMSeriesIDs(str(dcm_dir))
        if not series_ids:
            continue

        for sid in series_ids:
            files = sitk.ImageSeriesReader.GetGDCMSeriesFileNames(str(dcm_dir), sid)

            # Only the header of the first file in the series is read;
            # no pixel data is loaded.
            reader = sitk.ImageFileReader()
            reader.SetFileName(files[0])
            reader.ReadImageInformation()

            # Use the real Patient ID tag. Fall back to Patient's Name only
            # when the ID is missing or blank, so files that carry the
            # identifier in the name field alone still get a value.
            patient_id = _read_tag(reader, TAG_PATIENT_ID)
            if patient_id is None or not patient_id.strip():
                patient_id = _read_tag(reader, TAG_PATIENT_NAME)

            meta = {
                "folder": str(dcm_dir),
                "series_id": sid,
                "patient_id": patient_id,
                "study_uid": _read_tag(reader, TAG_STUDY_UID),
                "series_description": _read_tag(reader, TAG_SERIES_DESCRIPTION),
                "modality": _read_tag(reader, TAG_MODALITY),
                "manufacturer": _read_tag(reader, TAG_MANUFACTURER),
                "num_files": len(files),
            }
            records.append(meta)

    except Exception as e:
        # Best effort: report the folder that failed and keep going.
        print(f"‚ö†Ô∏è Skipping folder {dcm_dir}: {e}")


In [5]:
# ----------------------------------------------------------
# Step 4: Build combined metadata table
# ----------------------------------------------------------
if not records:
    raise SystemExit("‚ùå No valid DICOM metadata found in any folder.")

meta_df = pd.DataFrame(records)


def _strip_text(value):
    """Strip surrounding whitespace from strings; return any other value unchanged."""
    return value.strip() if isinstance(value, str) else value


# Clean and normalize.
# patient_id may be None when the tag was absent. Stripping only real strings
# keeps those entries missing instead of turning them into the text "None",
# which would otherwise be counted as a patient and written to the CSV.
meta_df["patient_id"] = meta_df["patient_id"].map(_strip_text)
meta_df["series_id"] = meta_df["series_id"].astype(str).str.strip()

# Deduplicate: keep the first row seen for each series ID.
meta_df = meta_df.drop_duplicates(subset=["series_id"])

print(f"‚úÖ Extracted {len(meta_df)} unique DICOM series across all folders.")
print(meta_df.head(5))


‚úÖ Extracted 521 unique DICOM series across all folders.
                                              folder  \
0  data\raw\tcia\manifest-1669817128730\Colorecta...   
1  data\raw\tcia\manifest-1669817128730\Colorecta...   
2  data\raw\tcia\manifest-1669817128730\Colorecta...   
3  data\raw\tcia\manifest-1669817128730\Colorecta...   
4  data\raw\tcia\manifest-1669817128730\Colorecta...   

                                           series_id    patient_id  \
0  1.3.6.1.4.1.14519.5.2.1.9203.8273.370971589400...  CRLM-CT-1001   
1  1.3.6.1.4.1.14519.5.2.1.9203.8273.533669585389...  CRLM-CT-1001   
2  1.3.6.1.4.1.14519.5.2.1.9203.8273.302964673037...  CRLM-CT-1002   
3  1.3.6.1.4.1.14519.5.2.1.9203.8273.455038217313...  CRLM-CT-1002   
4  1.3.6.1.4.1.14519.5.2.1.9203.8273.233692783386...  CRLM-CT-1003   

                                           study_uid series_description  \
0  1.3.6.1.4.1.14519.5.2.1.9203.8273.123231034424...       Segmentation   
1  1.3.6.1.4.1.14519.5.2.1.9203.82

In [6]:
# ----------------------------------------------------------
# Step 5: Save results safely
# ----------------------------------------------------------
# Write to a temporary sibling file first and swap it into place only after
# the write succeeded. If to_csv fails, the previous CSV (if any) is left
# untouched instead of having been deleted up front.
tmp_file = OUT_FILE.with_name(OUT_FILE.name + ".tmp")
try:
    meta_df.to_csv(tmp_file, index=False)
    tmp_file.replace(OUT_FILE)  # overwrites an existing OUT_FILE
finally:
    # After a successful replace the temp file no longer exists; this only
    # cleans up a partial file left behind by a failed write.
    if tmp_file.exists():
        tmp_file.unlink()

print(f"‚úÖ Saved metadata to: {OUT_FILE}")


‚úÖ Saved metadata to: data\processed\clinical\tcia_series_metadata.csv


In [7]:
# ----------------------------------------------------------
# Step 6: Sanity check: Required columns
# ----------------------------------------------------------
# Report any required column that is absent from the metadata table,
# listed in the same order as required_cols.
required_cols = ["patient_id", "series_id"]
missing = list(filter(lambda col: col not in meta_df.columns, required_cols))

if not missing:
    print("‚úÖ All required columns present: patient_id, series_id")
else:
    print(f"‚ö†Ô∏è Missing columns in output: {missing}")


‚úÖ All required columns present: patient_id, series_id


In [8]:
# ----------------------------------------------------------
# Step 7: Summary diagnostics
# ----------------------------------------------------------
# Headline counts: distinct patients, distinct series, and folders scanned.
n_patients = meta_df["patient_id"].nunique()
n_series = meta_df["series_id"].nunique()
n_folders = len(dcm_dirs)

print(f"üìä Unique patients in metadata: {n_patients}")
print(f"üìä Unique series IDs: {n_series}")
print(f"üìä Total DICOM folders scanned: {n_folders}")

üìä Unique patients in metadata: 225
üìä Unique series IDs: 521
üìä Total DICOM folders scanned: 521
