In [None]:
import pandas as pd
from pathlib import Path
import shutil
import os
import numpy as np

In [72]:
# Load the ISIC 2024 ground-truth labels and the supplementary metadata, then
# print two random rows of each as a quick look at the columns.
df_gt = pd.read_csv("/home2/lu2277di/data/ISIC2024/ISIC_2024_Training_GroundTruth.csv")
# low_memory=False makes pandas infer every column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the line echoed under this cell looks
# like the tail of a mixed-type DtypeWarning for this read - confirm it is gone.
df = pd.read_csv(
    "/home2/lu2277di/data/ISIC2024/ISIC_2024_Training_Supplement.csv",
    low_memory=False,
)
print("gt csv:", df_gt.sample(2))
print("supplement csv:", df.sample(2))

gt csv:              isic_id  malignant
372497  ISIC_9291330        0.0
39196   ISIC_1038667        0.0
supplement csv:              isic_id                                        attribution  \
305035  ISIC_7628914  Frazer Institute, The University of Queensland...   
157085  ISIC_3970293             Memorial Sloan Kettering Cancer Center   

       copyright_license lesion_id iddx_full  iddx_1 iddx_2 iddx_3 iddx_4  \
305035             CC-BY       NaN    Benign  Benign    NaN    NaN    NaN   
157085             CC-BY       NaN    Benign  Benign    NaN    NaN    NaN   

       iddx_5 mel_mitotic_index  mel_thick_mm  tbp_lv_dnn_lesion_confidence  
305035    NaN               NaN           NaN                     99.788719  
157085    NaN               NaN           NaN                     99.948597  


  df = pd.read_csv("/home2/lu2277di/data/ISIC2024/ISIC_2024_Training_Supplement.csv")


In [None]:
# Partition the supplement rows by the value of their iddx_1 column.
dx = df["iddx_1"]
dx.unique()  # result is discarded: a cell only renders its last expression
malignant_df, indeterminate_df, benign_df = (
    df.loc[dx == label] for label in ("Malignant", "Indeterminate", "Benign")
)
# Shapes seen on the recorded run: (393, 13), (114, 13), (400552, 13)
(malignant_df.shape, indeterminate_df.shape, benign_df.shape)

((393, 13), (114, 13), (400552, 13))

In [None]:
# Build a test set from a subset of the ISIC2024 training data: every case whose
# iddx_1 is "Malignant" plus a random sample of "Benign" cases. The selected
# images are copied to TDA/Test_Input and their ground-truth rows are written to
# TDA/Test_GroundTruth/Test_GroundTruth.csv.
# ------------------------------------------------------------------
# Paths and sampling settings
# ------------------------------------------------------------------
base_dir = Path("/home2/lu2277di/data/ISIC2024")

gt_path = base_dir / "ISIC_2024_Training_GroundTruth.csv"
supp_path = base_dir / "ISIC_2024_Training_Supplement.csv"
input_dir = base_dir / "ISIC_2024_Training_Input"

tda_dir = base_dir / "TDA"
out_img_dir = tda_dir / "Test_Input"
out_gt_dir = tda_dir / "Test_GroundTruth"

N_TEST_BENIGN = 1500   # number of benign cases drawn for the test set
TEST_SPLIT_SEED = 0    # fixed seed so the same benign cases are drawn on every run

out_img_dir.mkdir(parents=True, exist_ok=True)
out_gt_dir.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------------
# 1. Load CSVs
# ------------------------------------------------------------------
df_gt = pd.read_csv(gt_path)        # columns: isic_id, malignant
# low_memory=False: infer dtypes from the whole file instead of chunk by chunk.
# NOTE(review): the line echoed under this cell looks like the tail of a
# mixed-type DtypeWarning for this read - confirm it no longer appears.
df_supp = pd.read_csv(supp_path, low_memory=False)    # columns incl: isic_id, iddx_1, ...

print("Ground truth columns:", df_gt.columns.tolist())
print("Supplement columns:", df_supp.columns.tolist())

# ------------------------------------------------------------------
# 2. Split by iddx_1 (using supplement for diagnosis type)
# ------------------------------------------------------------------
malignant_df = df_supp[df_supp["iddx_1"] == "Malignant"]
benign_df = df_supp[df_supp["iddx_1"] == "Benign"]
indeterminate_df = df_supp[df_supp["iddx_1"] == "Indeterminate"]

print("Counts from supplement:")
print("  Malignant     :", len(malignant_df))
print("  Benign        :", len(benign_df))
print("  Indeterminate :", len(indeterminate_df))

# ------------------------------------------------------------------
# 3. Sample N_TEST_BENIGN benign cases (no indeterminate)
# ------------------------------------------------------------------
benign_sample_df = benign_df.sample(n=N_TEST_BENIGN, random_state=TEST_SPLIT_SEED)

# Combine all malignant + sampled benign (based on supplement)
subset_supp = pd.concat([malignant_df, benign_sample_df], axis=0)
subset_supp = subset_supp.reset_index(drop=True)

print("Total selected (supplement-based):", len(subset_supp))

# ------------------------------------------------------------------
# 4. Use isic_id to select corresponding rows from the GROUND TRUTH CSV
# ------------------------------------------------------------------
selected_ids = subset_supp["isic_id"].unique()
subset_gt = df_gt[df_gt["isic_id"].isin(selected_ids)].copy()

print("Rows found in ground truth for selected IDs:", len(subset_gt))

# Sanity check: we expect same number unless some IDs are missing in gt
labelled_ids = set(subset_gt["isic_id"])
missing_in_gt = [isic_id for isic_id in selected_ids if isic_id not in labelled_ids]
if missing_in_gt:
    print("WARNING: some isic_id present in supplement but missing in ground truth, e.g.:")
    print(missing_in_gt[:10])

# ------------------------------------------------------------------
# 5. Copy corresponding images into TDA/Test_Input
#    Assumes image files are named "<isic_id>.jpg"
#    Only IDs that have a ground-truth row are copied, so the image folder
#    never contains an image without a label.
# ------------------------------------------------------------------
copied_ids = []
missing_images = []

for isic_id in subset_gt["isic_id"].unique():
    src = input_dir / f"{isic_id}.jpg"
    dst = out_img_dir / f"{isic_id}.jpg"

    if src.is_file():
        shutil.copy2(src, dst)
        copied_ids.append(isic_id)
    else:
        missing_images.append(str(src))

print(f"Copied {len(copied_ids)} images.")
if missing_images:
    print("WARNING: missing image files, e.g.:")
    print("\n".join(missing_images[:10]))

# ------------------------------------------------------------------
# 6. Save filtered GROUND TRUTH CSV into TDA/Test_GroundTruth
#    Rows whose image could not be copied are dropped so the CSV lists
#    exactly the images present in TDA/Test_Input.
# ------------------------------------------------------------------
rows_before = len(subset_gt)
subset_gt = subset_gt[subset_gt["isic_id"].isin(copied_ids)]
if len(subset_gt) < rows_before:
    print(f"WARNING: dropped {rows_before - len(subset_gt)} ground truth rows without an image file.")

out_csv_path = out_gt_dir / "Test_GroundTruth.csv"
subset_gt.to_csv(out_csv_path, index=False)

print("Wrote ground truth CSV to:", out_csv_path)


Ground truth columns: ['isic_id', 'malignant']
Supplement columns: ['isic_id', 'attribution', 'copyright_license', 'lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence']
Counts from supplement:
  Malignant     : 393
  Benign        : 400552
  Indeterminate : 114
Total selected (supplement-based): 1893
Rows found in ground truth for selected IDs: 1893
Copied 1893 images.
Wrote ground truth CSV to: /home2/lu2277di/data/ISIC2024/TDA/TDA_Test_GroundTruth/ISIC_2024_TDA_Test_GroundTruth.csv


  df_supp = pd.read_csv(supp_path)    # columns incl: isic_id, iddx_1, ...


In [None]:
# Build a benign-only training subsample: draw N_SAMPLES rows with malignant == 0
# from the training ground truth, excluding every image listed in the TDA test
# ground truth, then write their ground-truth CSV and copy their images.

ROOT = "/home2/lu2277di/data/ISIC2024"
TDA_ROOT = os.path.join(ROOT, "TDA")

TRAIN_GT_CSV = os.path.join(ROOT, "ISIC_2024_Training_GroundTruth.csv")
TRAIN_IMG_DIR = os.path.join(ROOT, "ISIC_2024_Training_Input")

TEST_GT_CSV = os.path.join(TDA_ROOT, "Test_GroundTruth", "Test_GroundTruth.csv")

OUT_IMG_DIR = os.path.join(TDA_ROOT, "Training_Input", "ISIC2024_Training_Input_Subsampled")
OUT_GT_CSV = os.path.join(TDA_ROOT, "Training_GroundTruth", "ISIC2024_Training_GroundTruth_Subsampled.csv")

N_SAMPLES = 10000
RANDOM_SEED = 42

os.makedirs(OUT_IMG_DIR, exist_ok=True)
os.makedirs(os.path.dirname(OUT_GT_CSV), exist_ok=True)

print("Loading training ground truth...")
train_df = pd.read_csv(TRAIN_GT_CSV)

print("Loading test ground truth...")
test_df = pd.read_csv(TEST_GT_CSV)

# Validate the columns this cell relies on.
if "isic_id" not in train_df.columns:
    raise RuntimeError(f"'isic_id' column not found in {TRAIN_GT_CSV}")
if "malignant" not in train_df.columns:
    raise RuntimeError(f"'malignant' column not found in {TRAIN_GT_CSV}")

# The test CSV may name its ID column "image" or "isic_id" (the test-set cell
# earlier in this notebook writes "isic_id"), so accept either; "image" wins
# when both are present.
test_id_col = next((col for col in ("image", "isic_id") if col in test_df.columns), None)
if test_id_col is None:
    raise RuntimeError(f"Neither 'image' nor 'isic_id' column found in {TEST_GT_CSV}")

# Images that are in test set (to exclude from training sampling)
test_images = set(test_df[test_id_col].astype(str))

# All benign train samples that are NOT in test set
benign_train = train_df[
    (train_df["malignant"] == 0.0) &
    (~train_df["isic_id"].astype(str).isin(test_images))
].copy()

print(f"Total benign training images (excluding test): {len(benign_train)}")

if len(benign_train) < N_SAMPLES:
    raise RuntimeError(
        f"Requested {N_SAMPLES} benign samples but only {len(benign_train)} available."
    )

# Sample N_SAMPLES benign images without replacement, seeded for reproducibility
rng = np.random.default_rng(RANDOM_SEED)
sample_idx = rng.choice(benign_train.index.values, size=N_SAMPLES, replace=False)
benign_sample = benign_train.loc[sample_idx].reset_index(drop=True)

print(f"Sampling {len(benign_sample)} benign images.")

# Check every source image BEFORE writing anything, so a missing file cannot
# leave a CSV and a half-filled image folder behind.
# adjust extension if needed (ISIC usually .jpg)
image_ids = benign_sample["isic_id"].astype(str).tolist()
missing_files = [
    src
    for src in (os.path.join(TRAIN_IMG_DIR, image_id + ".jpg") for image_id in image_ids)
    if not os.path.exists(src)
]
if missing_files:
    raise RuntimeError(
        f"Image file not found: {missing_files[0]} "
        f"({len(missing_files)} missing in total)"
    )

# Save corresponding GroundTruth CSV for the subsampled set
benign_sample.to_csv(OUT_GT_CSV, index=False)
print(f"Wrote subsampled GroundTruth to: {OUT_GT_CSV}")

# Copy images
for image_id in image_ids:
    src = os.path.join(TRAIN_IMG_DIR, image_id + ".jpg")
    dst = os.path.join(OUT_IMG_DIR, image_id + ".jpg")
    shutil.copy2(src, dst)

print(f"Copied benign images to: {OUT_IMG_DIR}")
print(f"Missing files: {len(missing_files)}")
print("Done.")


Loading training ground truth...
Loading test ground truth...
Total benign training images (excluding test): 399166
Sampling 10000 benign images.
Wrote subsampled GroundTruth to: /home2/lu2277di/data/ISIC2024/TDA/Training_GroundTruth/ISIC2024_Training_GroundTruth_Subsampled.csv
Copied benign images to: /home2/lu2277di/data/ISIC2024/TDA/Training_Input/ISIC2024_Training_Input_Subsampled
Missing files: 0
Done.
