## Import Packages

In [7]:
import kagglehub
import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler


## Load the dataset

In [8]:
# Read the ISIC 2019 training metadata CSV into a DataFrame.
metadata_df = pd.read_csv(
    'data/andrewmvd/isic-2019/versions/1/ISIC_2019_Training_Metadata.csv'
)

# Quick sanity check: file name, first five rows, and the total row count.
print("ISIC_2019_Training_Metadata.csv")
print(metadata_df.head(n=5), "\n")
print("Number of rows in MetaData CSV:", metadata_df.shape[0])

ISIC_2019_Training_Metadata.csv
          image  age_approx anatom_site_general lesion_id     sex
0  ISIC_0000000        55.0      anterior torso       NaN  female
1  ISIC_0000001        30.0      anterior torso       NaN  female
2  ISIC_0000002        60.0     upper extremity       NaN  female
3  ISIC_0000003        30.0     upper extremity       NaN    male
4  ISIC_0000004        80.0     posterior torso       NaN    male 

Number of rows in MetaData CSV: 25331


In [9]:
# Per-column count of missing values in the raw metadata.
print(metadata_df.isna().sum())

image                     0
age_approx              437
anatom_site_general    2631
lesion_id              2084
sex                     384
dtype: int64


In [10]:
# Summarize the raw metadata: column dtypes, non-null counts and memory usage.
metadata_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   image                25331 non-null  object 
 1   age_approx           24894 non-null  float64
 2   anatom_site_general  22700 non-null  object 
 3   lesion_id            23247 non-null  object 
 4   sex                  24947 non-null  object 
dtypes: float64(1), object(4)
memory usage: 989.6+ KB


In [11]:
# Clean an independent (deep) copy so the raw frame loaded above stays unchanged.
metadata_clean = metadata_df.copy(deep=True)

## Cleaning the data

In [12]:
# Number of rows whose 'image' value repeats an earlier row.
metadata_clean.duplicated(subset=['image']).sum()


np.int64(0)

In [13]:

# Replace missing anatomical sites with the literal 'unknown', then store the
# column as a pandas categorical.
metadata_clean['anatom_site_general'] = (
    metadata_clean['anatom_site_general']
    .fillna('unknown')
    .astype('category')
)

In [14]:
# Impute missing ages with the column median (computed from the non-missing values).
median_age = metadata_clean['age_approx'].median()
metadata_clean['age_approx'] = metadata_clean['age_approx'].fillna(value=median_age)

# Standardize the imputed ages. StandardScaler expects a 2-D input, hence the
# single-column DataFrame rather than a Series.
scaler = StandardScaler()
age_frame = metadata_clean[['age_approx']]
metadata_clean['age_approx'] = scaler.fit_transform(age_frame)

In [15]:
# Count how often each lesion_id occurs. This runs before the NaN fill below so
# that the 'unknown' placeholder is not counted as one heavily repeated lesion
# (value_counts skips NaN by default).
lesion_counts = metadata_clean['lesion_id'].value_counts()
# Lesions that appear on more than one image
duplicates = lesion_counts[lesion_counts > 1]
# Fill missing values in 'lesion_id' with 'unknown'
metadata_clean['lesion_id'] = metadata_clean['lesion_id'].fillna('unknown')
# Number of lesions with multiple images. Only the final expression of a cell is
# displayed, so this must be the last line for the count to be shown at all.
len(duplicates)


In [16]:
# Label rows with no recorded sex as 'unknown'.
metadata_clean.loc[metadata_clean['sex'].isna(), 'sex'] = 'unknown'


In [17]:
# Per-column count of missing values after the cleaning steps.
print(metadata_clean.isna().sum())


image                  0
age_approx             0
anatom_site_general    0
lesion_id              0
sex                    0
dtype: int64


In [18]:
# Summarize the cleaned metadata: column dtypes, non-null counts and memory usage.
metadata_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   image                25331 non-null  object  
 1   age_approx           25331 non-null  float64 
 2   anatom_site_general  25331 non-null  category
 3   lesion_id            25331 non-null  object  
 4   sex                  25331 non-null  object  
dtypes: category(1), float64(1), object(3)
memory usage: 816.8+ KB


In [19]:
# One-hot encode the two categorical columns; each category becomes its own
# '<column>_<value>' indicator column and the source columns are dropped.
metadata_clean = pd.get_dummies(
    data=metadata_clean,
    columns=[
        'sex',
        'anatom_site_general',
    ],
)
metadata_clean.head(n=5)


Unnamed: 0,image,age_approx,lesion_id,sex_female,sex_male,sex_unknown,anatom_site_general_anterior torso,anatom_site_general_head/neck,anatom_site_general_lateral torso,anatom_site_general_lower extremity,anatom_site_general_oral/genital,anatom_site_general_palms/soles,anatom_site_general_posterior torso,anatom_site_general_unknown,anatom_site_general_upper extremity
0,ISIC_0000000,0.053119,unknown,True,False,False,True,False,False,False,False,False,False,False,False
1,ISIC_0000001,-1.33778,unknown,True,False,False,True,False,False,False,False,False,False,False,False
2,ISIC_0000002,0.331299,unknown,True,False,False,False,False,False,False,False,False,False,False,True
3,ISIC_0000003,-1.33778,unknown,False,True,False,False,False,False,False,False,False,False,False,True
4,ISIC_0000004,1.444018,unknown,False,True,False,False,False,False,False,False,False,True,False,False


In [22]:
# Keep only rows whose lesion_id is not the 'unknown' placeholder, as an
# independent copy, then summarize the result.
metadata_clean_final = metadata_clean.loc[
    metadata_clean['lesion_id'].ne('unknown')
].copy()
metadata_clean_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23247 entries, 1459 to 25330
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   image                                23247 non-null  object 
 1   age_approx                           23247 non-null  float64
 2   lesion_id                            23247 non-null  object 
 3   sex_female                           23247 non-null  bool   
 4   sex_male                             23247 non-null  bool   
 5   sex_unknown                          23247 non-null  bool   
 6   anatom_site_general_anterior torso   23247 non-null  bool   
 7   anatom_site_general_head/neck        23247 non-null  bool   
 8   anatom_site_general_lateral torso    23247 non-null  bool   
 9   anatom_site_general_lower extremity  23247 non-null  bool   
 10  anatom_site_general_oral/genital     23247 non-null  bool   
 11  anatom_site_general_palms/sole

## Save as CSV

In [23]:
# DataFrame.to_csv does not create missing parent directories, so writing fails
# with OSError on a run where 'cleaned_data/' has not been created yet.
# Create the output folder first; exist_ok=True makes this safe to re-run.
output_path = 'cleaned_data/ISIC_2019_Training_Metadata_Cleaned.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: do not write the DataFrame's row index as a column.
metadata_clean_final.to_csv(output_path, index=False)