# Dataset preprocessing

In [13]:
%load_ext autoreload
%autoreload 2
    
import os

import cv2
import pandas as pd

import mydata
from measure import MeasureSkin
from preprocessing import HAM10000
from skincolors import IndividualTypologyAngle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## HAM10000

In [2]:
# Dataset identifier, metadata file name, and the dataset root built from the identifier.
# Note that root ends with a trailing "/".
db = "HAM"
metafile = "HAM10000_metadata.csv"
root = f"../../dataset/AAAI 2025/{db}/"

# Read the metadata table that the following cells extend, filter and split.
df = pd.read_csv(f"{root}{metafile}")

In [3]:
# Folder holding the segmentation files and folder that receives the masked
# images; both sit directly under the dataset root.
seg_directory = root + "ham_segmentation/"
save_directory = root + "mask/"

# Wrap the metadata frame in the project's HAM10000 helper and run its
# preparation steps in order (see preprocessing.HAM10000 for what each does).
ham = HAM10000(root, df)
ham.update_dataset()
ham.check_segmentation(seg_directory)

# df is rebound to the frame returned here; later cells read its
# "masked filepath" column.
df = ham.create_masked_image(save_directory)

../../dataset/AAAI 2025/HAM/ham_segmentation/ISIC_0034313_segmentation.png
../../dataset/AAAI 2025/HAM/ham_segmentation/ISIC_0034314_segmentation.png
Completed to make masked files


In [4]:
# For every masked image listed in df, compute its mean Individual Typology
# Angle (ITA) and the skin-tone category that the ITA maps to, then store both
# as new columns.
# NOTE(review): the recorded run emitted nanmean/nanvar warnings from the ITA
# code, so some images may yield a NaN ITA — confirm how map_skin_tone treats it.
ita_values = []
skin_tones = []
for masked_path in df["masked filepath"]:
    img = cv2.imread(masked_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; fail here with the offending path rather than with an
        # opaque error inside cvtColor.
        raise FileNotFoundError(f"Could not read masked image: {masked_path}")

    # The image is converted from BGR to RGB before being passed to
    # IndividualTypologyAngle.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    ita = IndividualTypologyAngle(rgb)
    ita_value = ita.get_mean_ita()
    ita_values.append(ita_value)

    skin_tone = ita.map_skin_tone(ita_value)
    skin_tones.append(skin_tone)

# The lists were filled in row order, so they line up with df positionally.
df["mean ita"] = ita_values
df["skin tone"] = skin_tones

  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  mean = np.nanmean(l)
  ita = math.atan2(np.nanmean(l) - 50, np.nanmean(b)) * (180 / np.pi)


In [5]:
# Count how many rows fall into each skin-tone category and print the tally.
skin_tone_column = df["skin tone"]
skin_tone_counts = skin_tone_column.value_counts()
print(skin_tone_counts)

skin tone
1    6731
2     608
3     281
4     143
6      28
5      25
Name: count, dtype: int64


In [6]:
# Keep only the rows whose "skin tone" equals the string "1"; every other row
# is removed from df in place.
is_tone_1 = df["skin tone"].isin(["1"])
index = df.loc[~is_tone_1].index
df.drop(index=index, inplace=True)

In [7]:
# Hand the filtered frame (skin tone "1" only) back to the HAM10000 helper and
# report the class balance; the recorded output lists row counts per
# (labels, skin tone) pair followed by the smaller class size.
ham.override_dataset(df)
ham.check_balance()

labels  skin tone
0       1            5647
1       1            1084
dtype: int64
1084


In [8]:
# NOTE(review): override_dataset(df) was already called with this same df in
# the previous cell; the repeat looks redundant unless check_balance() alters
# the helper's state — confirm before removing.
ham.override_dataset(df)
# Presumably equalises the two label classes (the split output below totals
# 1084 rows per label) — verify in preprocessing.HAM10000.
df_balanced = ham.balance_dataset()

In [9]:
# Split the balanced frame into train / validation / test frames.
# NOTE(review): no seed is passed here — confirm split_three_dataset is
# deterministic so the exported CSVs can be reproduced.
df_train, df_valid, df_test = mydata.split_three_dataset(df_balanced)

Final sizes - train: 1300 validation: 434 test: 434
---train-------------
labels
0    650
1    650
dtype: int64
---valid-------------
labels
0    217
1    217
dtype: int64
---test-------------
labels
0    217
1    217
dtype: int64


In [10]:
# Row counts of the training split for each (labels, skin tone) combination.
train_group_sizes = df_train.groupby(["labels", "skin tone"]).size()
print(train_group_sizes)

labels  skin tone
0       1            650
1       1            650
dtype: int64


In [11]:
# Write the three splits to <root>/dataframe/df_{train,valid,test}.csv.
# root already ends with "/", so the previous f"{root}/dataframe/..." produced
# a doubled separator; os.path.join builds the path without it.
dataframe_dir = os.path.join(root, "dataframe")
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(dataframe_dir, exist_ok=True)

for split_name, split_df in (("train", df_train), ("valid", df_valid), ("test", df_test)):
    split_df.to_csv(os.path.join(dataframe_dir, f"df_{split_name}.csv"), index=False)

# Measure Distance

In [14]:
# Create the project's MeasureSkin helper and let it choose its baseline image
# from the validation split; the recorded output reports the chosen file name
# and its ITA value.
ms = MeasureSkin()
ms.select_baseline_skin(df_valid)

BaseLine File Name: ../../dataset/AAAI 2025/HAM/mask/ISIC_0029363.jpg
Conventional ITA values: 73.92512124541418


In [15]:
# Run the MeasureSkin helper over the splits in train, valid, test order and
# keep the frame it returns for each.
df_new_train, df_new_valid, df_new_test = [
    ms.measure(split_df) for split_df in (df_train, df_valid, df_test)
]

Completed: 1300
Completed: 434
Completed: 434


In [16]:
# Write the frames returned by ms.measure to <root>/dataframe/df_{train,valid,test}.csv.
# These are the same files the earlier export cell wrote, so the earlier CSVs
# are replaced by the measured versions.
# root already ends with "/", so the previous f"{root}/dataframe/..." produced
# a doubled separator; os.path.join builds the path without it.
measured_dir = os.path.join(root, "dataframe")
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(measured_dir, exist_ok=True)

for split_name, measured_df in (
    ("train", df_new_train),
    ("valid", df_new_valid),
    ("test", df_new_test),
):
    measured_df.to_csv(os.path.join(measured_dir, f"df_{split_name}.csv"), index=False)