## Import Packages

In [41]:
import kagglehub
import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler


## Load the dataset

In [42]:
metadata_df = pd.read_csv('data/andrewmvd/isic-2019/versions/1/ISIC_2019_Training_Metadata.csv')


print("ISIC_2019_Training_Metadata.csv")
print(metadata_df.head(),"\n")
print("Number of rows in MetaData CSV:", len(metadata_df))

ISIC_2019_Training_Metadata.csv
          image  age_approx anatom_site_general lesion_id     sex
0  ISIC_0000000        55.0      anterior torso       NaN  female
1  ISIC_0000001        30.0      anterior torso       NaN  female
2  ISIC_0000002        60.0     upper extremity       NaN  female
3  ISIC_0000003        30.0     upper extremity       NaN    male
4  ISIC_0000004        80.0     posterior torso       NaN    male 

Number of rows in MetaData CSV: 25331


In [43]:
print(metadata_df.isnull().sum())

image                     0
age_approx              437
anatom_site_general    2631
lesion_id              2084
sex                     384
dtype: int64


In [44]:
metadata_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   image                25331 non-null  object 
 1   age_approx           24894 non-null  float64
 2   anatom_site_general  22700 non-null  object 
 3   lesion_id            23247 non-null  object 
 4   sex                  24947 non-null  object 
dtypes: float64(1), object(4)
memory usage: 989.6+ KB


In [45]:
metadata_clean = metadata_df.copy()

## Cleaning the data

In [46]:
metadata_clean['image'].duplicated().sum()


np.int64(0)

In [47]:

# Fill missing values in 'anatom_site_general' with 'unknown'
metadata_clean['anatom_site_general'] = metadata_clean['anatom_site_general'].fillna('unknown')
# Convert 'anatom_site_general' to categorical type

metadata_clean['anatom_site_general'] = metadata_clean['anatom_site_general'].astype('category')

In [48]:
# Fill missing values in 'age_approx' with the median age
median_age = metadata_clean['age_approx'].median()
metadata_clean['age_approx'] = metadata_clean['age_approx'].fillna(median_age)
# Standardize 'age_approx' using StandardScaler
scaler = StandardScaler()
metadata_clean['age_approx'] = scaler.fit_transform(metadata_clean[['age_approx']])

In [49]:
list_of_images_for_removing = metadata_clean[metadata_clean['lesion_id'].isna()]['image'].tolist()
print("Removing images with missing lesion_id:", list_of_images_for_removing)
print("Number of images with missing lesion_id:", len(list_of_images_for_removing))
metadata_clean = metadata_clean.dropna(subset=['lesion_id'])
print("Number of rows after cleaning:", len(metadata_clean))


Removing images with missing lesion_id: ['ISIC_0000000', 'ISIC_0000001', 'ISIC_0000002', 'ISIC_0000003', 'ISIC_0000004', 'ISIC_0000006', 'ISIC_0000007', 'ISIC_0000008', 'ISIC_0000009', 'ISIC_0000010', 'ISIC_0000011', 'ISIC_0000012', 'ISIC_0000013', 'ISIC_0000014', 'ISIC_0000015', 'ISIC_0000016', 'ISIC_0000017_downsampled', 'ISIC_0000018_downsampled', 'ISIC_0000019_downsampled', 'ISIC_0000020_downsampled', 'ISIC_0000021_downsampled', 'ISIC_0000022_downsampled', 'ISIC_0000023_downsampled', 'ISIC_0000024_downsampled', 'ISIC_0000025_downsampled', 'ISIC_0000026_downsampled', 'ISIC_0000027_downsampled', 'ISIC_0000028_downsampled', 'ISIC_0000029_downsampled', 'ISIC_0000030_downsampled', 'ISIC_0000031_downsampled', 'ISIC_0000032_downsampled', 'ISIC_0000034_downsampled', 'ISIC_0000035_downsampled', 'ISIC_0000036_downsampled', 'ISIC_0000037_downsampled', 'ISIC_0000038_downsampled', 'ISIC_0000039_downsampled', 'ISIC_0000040_downsampled', 'ISIC_0000041_downsampled', 'ISIC_0000042_downsampled', 'IS

In [50]:
with open('to_delete.txt', 'w') as f:
    for img in list_of_images_for_removing:
        f.write(f"{img}\n")

In [51]:
# Fill missing values in 'sex' with 'unknown'
metadata_clean['sex'] = metadata_clean['sex'].fillna('unknown')


In [52]:
print(metadata_clean.isnull().sum())


image                  0
age_approx             0
anatom_site_general    0
lesion_id              0
sex                    0
dtype: int64


In [53]:
metadata_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23247 entries, 1459 to 25330
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   image                23247 non-null  object  
 1   age_approx           23247 non-null  float64 
 2   anatom_site_general  23247 non-null  category
 3   lesion_id            23247 non-null  object  
 4   sex                  23247 non-null  object  
dtypes: category(1), float64(1), object(3)
memory usage: 931.2+ KB


In [54]:
# Convert 'sex' and 'anatom_site_general' to one-hot encoded variables
metadata_clean = pd.get_dummies(metadata_clean, columns=['sex', 'anatom_site_general'])
metadata_clean.head()


Unnamed: 0,image,age_approx,lesion_id,sex_female,sex_male,sex_unknown,anatom_site_general_anterior torso,anatom_site_general_head/neck,anatom_site_general_lateral torso,anatom_site_general_lower extremity,anatom_site_general_oral/genital,anatom_site_general_palms/soles,anatom_site_general_posterior torso,anatom_site_general_unknown,anatom_site_general_upper extremity
1459,ISIC_0012653_downsampled,-0.225061,MSK4_0011169,True,False,False,False,False,False,False,False,False,True,False,False
1460,ISIC_0012654_downsampled,-1.33778,MSK4_0011170,True,False,False,False,False,False,True,False,False,False,False,False
1461,ISIC_0012655_downsampled,-1.059601,MSK4_0011171,True,False,False,False,False,False,False,False,False,False,False,True
1462,ISIC_0012656_downsampled,-0.503241,MSK4_0011172,False,True,False,False,False,False,False,False,False,True,False,False
1463,ISIC_0012657_downsampled,-1.89414,MSK4_0011173,True,False,False,False,False,False,False,False,False,False,False,True


## Save as csv

In [55]:
metadata_clean.to_csv('cleaned_data/ISIC_2019_Training_Metadata_Cleaned.csv', index=False)