In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Single root for every dataset file used in this notebook; change it here to
# relocate the whole data tree instead of editing each path below.
DATASETS_DIR = "./datasets"

# ISIC 2019: ground-truth label CSV and image folder.
ISIC_GT_PATH = f"{DATASETS_DIR}/ISIC_2019_Training_GroundTruth.csv"
ISIC_IMAGES_DIR = f"{DATASETS_DIR}/ISIC_2019_Training_Input"

# HAM10000: metadata CSV and image folder.
HAM_META_PATH = f"{DATASETS_DIR}/HAM10000_metadata.csv"
HAM_IMAGES_DIR = f"{DATASETS_DIR}/HAM10000_images"

# Destination path for the merged three-class table.
OUTPUT_CSV = f"{DATASETS_DIR}/merged_ham_isic_3class.csv"

In [None]:
# Read the ISIC ground-truth table from disk, then preview its first rows.
isic_df = pd.read_csv(filepath_or_buffer=ISIC_GT_PATH)
isic_df.head(n=5)

In [None]:
# Column-wise totals over every column after the first one
# (the first column is skipped positionally).
isic_label_block = isic_df.iloc[:, 1:]
isic_class_counts = isic_label_block.sum()
isic_class_counts

In [None]:
# Keep only the ISIC rows whose MEL/BCC/AK values add up to exactly 1,
# and retain just the image id plus those three label columns.
isic_target_cols = ["MEL", "BCC", "AK"]
in_target_class = isic_df[isic_target_cols].sum(axis=1).eq(1)

isic_filtered = isic_df.loc[in_target_class, ["image", *isic_target_cols]].copy()
# Provenance tag so rows can be traced back after merging.
isic_filtered["source"] = "ISIC2019"

# Per-class totals of the retained rows.
print(isic_filtered[isic_target_cols].sum())

In [None]:
# Read the HAM10000 metadata table from disk, then preview its first rows.
ham_df = pd.read_csv(filepath_or_buffer=HAM_META_PATH)
ham_df.head(n=5)

In [None]:
# Frequency of each diagnosis code in the HAM10000 `dx` column.
ham_dx_column = ham_df["dx"]
ham_dx_column.value_counts()

In [None]:
# Translation from HAM10000 `dx` codes to the label column names used on the
# ISIC side of this notebook. Only the codes listed here are kept downstream.
HAM_TO_ISIC_MAP = dict(
    [
        ("akiec", "AK"),
        ("bcc", "BCC"),
        ("mel", "MEL"),
    ]
)

In [None]:
# HAM10000 diagnosis codes that have an entry in the mapping table.
ham_target_classes = list(HAM_TO_ISIC_MAP)

# Restrict the metadata to those codes and attach the translated label as a
# new `dx_mapped` column (assign returns a new frame, so `ham_df` is untouched).
has_target_dx = ham_df["dx"].isin(ham_target_classes)
ham_filtered = ham_df.loc[has_target_dx].assign(
    dx_mapped=lambda frame: frame["dx"].map(HAM_TO_ISIC_MAP)
)

ham_filtered["dx_mapped"].value_counts()

In [None]:
# Label column order for the one-hot block.
ham_label_order = ["MEL", "BCC", "AK"]

# One-hot encode the mapped diagnosis as floats. Reindexing the columns both
# enforces the order above and creates an all-zero column for any label that
# does not occur in the data.
ham_onehot = pd.get_dummies(ham_filtered["dx_mapped"], dtype=float).reindex(
    columns=ham_label_order, fill_value=0.0
)

# Build the output frame on a fresh 0..n-1 index and put the image id in front;
# the ids are taken as a plain array so they line up by position, not by index.
ham_formatted = ham_onehot.reset_index(drop=True)
ham_formatted.insert(0, "image", ham_filtered["image_id"].values)
# Provenance tag so rows can be traced back after merging.
ham_formatted["source"] = "HAM10000"

ham_formatted.head()

In [None]:
# Image ids present on each side.
isic_images = set(isic_filtered["image"])
ham_images = set(ham_formatted["image"])

# Ids that appear in both tables.
# NOTE(review): the comparison uses `isic_filtered` (the MEL/BCC/AK subset),
# not the full `isic_df` — an id that ISIC lists under another class would not
# be detected here; confirm this is intended.
overlap = isic_images & ham_images
print(f"Overlapping images: {len(overlap)}")

# Drop the shared ids from the HAM side so each image is kept only once
# (the ISIC copy wins).
is_shared_image = ham_formatted["image"].isin(overlap)
ham_formatted = ham_formatted.loc[~is_shared_image]

In [None]:
# Stack the ISIC rows on top of the remaining HAM rows and renumber the index
# from zero so the combined table has no repeated index labels.
frames_to_merge = [isic_filtered, ham_formatted]
merged_df = pd.concat(frames_to_merge, axis=0, ignore_index=True)

In [None]:
# Per-source totals of each label column in the merged table.
per_source_totals = merged_df.groupby("source")[["MEL", "BCC", "AK"]].sum()
print(per_source_totals)