In [35]:
import pandas as pd
import numpy as np
import os

In [36]:
ISIC_GT_PATH = "./datasets/ISIC_2019_Training_GroundTruth.csv"
ISIC_IMAGES_DIR = "./datasets/ISIC_2019_Training_Input"

HAM_META_PATH = "./datasets/HAM10000_metadata.csv"
HAM_IMAGES_DIR = "./datasets/HAM10000_images"

OUTPUT_CSV = "./datasets/merged_ham_isic_3class.csv"

In [37]:
isic_df = pd.read_csv(ISIC_GT_PATH)
isic_df.head()

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
isic_class_counts = isic_df.iloc[:, 1:].sum()
isic_class_counts

MEL      4522.0
NV      12875.0
BCC      3323.0
AK        867.0
BKL      2624.0
DF        239.0
VASC      253.0
SCC       628.0
UNK         0.0
dtype: float64

In [39]:
isic_filtered = isic_df[isic_df[["MEL", "BCC", "AK"]].sum(axis=1) == 1].copy()
isic_filtered = isic_filtered[["image"] + ["MEL", "BCC", "AK"]]
isic_filtered["source"] = "ISIC2019"

print(isic_filtered[["MEL", "BCC", "AK"]].sum())

MEL    4522.0
BCC    3323.0
AK      867.0
dtype: float64


In [40]:
ham_df = pd.read_csv(HAM_META_PATH)
ham_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [41]:
ham_df["dx"].value_counts()

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

In [42]:
HAM_TO_ISIC_MAP = {
    "akiec": "AK",
    "bcc": "BCC",
    "mel": "MEL"
}

In [43]:
ham_target_classes = list(HAM_TO_ISIC_MAP.keys())
ham_filtered = ham_df[ham_df["dx"].isin(ham_target_classes)].copy()

ham_filtered["dx_mapped"] = ham_filtered["dx"].map(HAM_TO_ISIC_MAP)

ham_filtered["dx_mapped"].value_counts()

dx_mapped
MEL    1113
BCC     514
AK      327
Name: count, dtype: int64

In [44]:
ham_onehot = pd.get_dummies(ham_filtered["dx_mapped"], dtype=float)

for col in ["MEL", "BCC", "AK"]:
    if col not in ham_onehot.columns:
        ham_onehot[col] = 0.0

ham_onehot = ham_onehot[["MEL", "BCC", "AK"]]

ham_formatted = pd.DataFrame({
    "image": ham_filtered["image_id"].values
})
ham_formatted = pd.concat([ham_formatted, ham_onehot.reset_index(drop=True)], axis=1)
ham_formatted["source"] = "HAM10000"

ham_formatted.head()

Unnamed: 0,image,MEL,BCC,AK,source
0,ISIC_0025964,1.0,0.0,0.0,HAM10000
1,ISIC_0030623,1.0,0.0,0.0,HAM10000
2,ISIC_0027190,1.0,0.0,0.0,HAM10000
3,ISIC_0031023,1.0,0.0,0.0,HAM10000
4,ISIC_0028086,1.0,0.0,0.0,HAM10000


In [45]:
isic_images = set(isic_filtered["image"])
ham_images = set(ham_formatted["image"])

overlap = isic_images.intersection(ham_images)
print(f"Overlapping images: {len(overlap)}")

ham_formatted = ham_formatted[~ham_formatted["image"].isin(overlap)]

Overlapping images: 1757


In [46]:
merged_df = pd.concat([isic_filtered, ham_formatted], ignore_index=True)

In [47]:
print(merged_df.groupby("source")[["MEL", "BCC", "AK"]].sum())

             MEL     BCC     AK
source                         
HAM10000     0.0     0.0  197.0
ISIC2019  4522.0  3323.0  867.0
