In [1]:
# --------------------------------------------------
# 05_prepare_full_index.ipynb
# Convert all TCIA DICOMs ‚Üí PNGs and build master index
# --------------------------------------------------

import os
from pathlib import Path
import pandas as pd
import pydicom
import cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# --------------------------------------------------
# Step 1: Setup paths
# --------------------------------------------------
os.chdir(r"C:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project")

RAW_DIR = Path("data/raw/tcia")
OUT_DIR = Path("data/processed/images/all_slices")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Clean existing directory
for old_file in OUT_DIR.glob("*.png"):
    old_file.unlink()


In [3]:
# --------------------------------------------------
# Step 2: Load clinical and metadata tables
# --------------------------------------------------
clinical_path = Path("data/processed/clinical/clinical_features_with_id.csv")
meta_path = Path("data/processed/clinical/tcia_series_metadata.csv")

if not clinical_path.exists() or not meta_path.exists():
    raise SystemExit("‚ùå Required input files missing. Please run 02_label_patients and 04_preprocess_wsi first.")

clinical_df = pd.read_csv(clinical_path)
meta_df = pd.read_csv(meta_path)

print(f"‚úÖ Loaded clinical data: {len(clinical_df)} patients")
print(f"‚úÖ Loaded imaging metadata: {len(meta_df)} series from {meta_df['patient_id'].nunique()} patients")


‚úÖ Loaded clinical data: 633 patients
‚úÖ Loaded imaging metadata: 521 series from 225 patients


In [4]:
# --------------------------------------------------
# Step 3: Merge clinical ‚Üî imaging metadata
# --------------------------------------------------
merged = meta_df.merge(clinical_df, on="patient_id", how="left")
print(f"üîó Merged metadata shape: {merged.shape}")

# Log label coverage
label_coverage = merged["metastasis_status"].notna().mean() * 100
print(f"üìä Label coverage after merge: {label_coverage:.1f}% of series have metastasis labels")


üîó Merged metadata shape: (521, 17)
üìä Label coverage after merge: 8.8% of series have metastasis labels


In [5]:
# --------------------------------------------------
# Step 4: Convert DICOMs ‚Üí PNGs
# --------------------------------------------------
dicom_files = list(RAW_DIR.rglob("*.dcm"))
print(f"üîç Found {len(dicom_files)} DICOM files under {RAW_DIR}")

records = []
bad_files = 0

for dcm_path in tqdm(dicom_files, desc="Converting DICOMs"):
    try:
        ds = pydicom.dcmread(dcm_path, force=True)
        img = ds.pixel_array

        # Normalize to 0‚Äì255
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX)
        img = img.astype("uint8")

        # Extract metadata
        patient_id = getattr(ds, "PatientID", None)
        series_id = getattr(ds, "SeriesInstanceUID", None)
        if patient_id is None or series_id is None:
            continue

        # Construct filename
        out_name = f"{patient_id}_{series_id}_{dcm_path.stem}.png"
        out_path = OUT_DIR / out_name
        cv2.imwrite(str(out_path), img)

        records.append({
            "patient_id": patient_id,
            "series_id": series_id,
            "slice_path": str(out_path).replace("\\", "/")
        })

    except Exception as e:
        bad_files += 1
        continue

print(f"‚úÖ Successfully converted {len(records)} DICOM slices ‚Üí PNGs")
print(f"‚ö†Ô∏è Skipped {bad_files} corrupted or unreadable files")


üîç Found 28019 DICOM files under data\raw\tcia


Converting DICOMs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28019/28019 [16:03<00:00, 29.07it/s] 

‚úÖ Successfully converted 27822 DICOM slices ‚Üí PNGs
‚ö†Ô∏è Skipped 197 corrupted or unreadable files





In [6]:
# --------------------------------------------------
# Step 5: Build image index
# --------------------------------------------------
img_df = pd.DataFrame(records)
img_df = img_df.drop_duplicates(subset=["slice_path"])

# Merge to include clinical labels
index_df = img_df.merge(merged, on=["patient_id", "series_id"], how="left")

print(f"‚úÖ Final index built: {len(index_df)} total image slices")
print(f"üìä Patients represented: {index_df['patient_id'].nunique()}")
print(f"üìä Series represented: {index_df['series_id'].nunique()}")


‚úÖ Final index built: 27822 total image slices
üìä Patients represented: 225
üìä Series represented: 324


In [7]:
# --------------------------------------------------
# Step 6: Save index file safely
# --------------------------------------------------
out_index = Path("data/processed/images/all_index.csv")
if out_index.exists():
    out_index.unlink()

index_df.to_csv(out_index, index=False)
print(f"‚úÖ Saved clean image index to: {out_index}")


‚úÖ Saved clean image index to: data\processed\images\all_index.csv


In [8]:
# --------------------------------------------------
# Step 7: Create new train/val/test splits (series-level)
# --------------------------------------------------
series_ids = index_df["series_id"].dropna().unique()
train_ids, temp_ids = train_test_split(series_ids, test_size=0.3, random_state=42)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)

split_dir = Path("data/splits")
split_dir.mkdir(parents=True, exist_ok=True)

pd.DataFrame({"series_id": train_ids}).to_csv(split_dir / "train_series.csv", index=False)
pd.DataFrame({"series_id": val_ids}).to_csv(split_dir / "val_series.csv", index=False)
pd.DataFrame({"series_id": test_ids}).to_csv(split_dir / "test_series.csv", index=False)

print("‚úÖ Train/Val/Test splits created under data/splits/")
print(f"   ‚û§ Train: {len(train_ids)} series")
print(f"   ‚û§ Val:   {len(val_ids)} series")
print(f"   ‚û§ Test:  {len(test_ids)} series")


‚úÖ Train/Val/Test splits created under data/splits/
   ‚û§ Train: 226 series
   ‚û§ Val:   49 series
   ‚û§ Test:  49 series


In [10]:
# --------------------------------------------------
# Step 8: Diagnostics summary
# --------------------------------------------------
print("\nüìã FINAL SUMMARY")
print(f"üß© Total image slices: {len(index_df)}")
print(f"üßç Unique patients: {index_df['patient_id'].nunique()}")
print(f"ü©ª Unique series: {index_df['series_id'].nunique()}")
print(f"üè∑Ô∏è Labeled slices: {index_df['metastasis_status'].notna().sum()} ({index_df['metastasis_status'].notna().mean()*100:.1f}%)")
print(f"üßæ Index saved to: {out_index}")


üìã FINAL SUMMARY
üß© Total image slices: 27822
üßç Unique patients: 225
ü©ª Unique series: 324
üè∑Ô∏è Labeled slices: 3408 (12.2%)
üßæ Index saved to: data\processed\images\all_index.csv
