In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import boxcox, yeojohnson

from PIL import Image
import cv2
import os

import warnings
warnings.filterwarnings('ignore')

# Exploratory Data Analysis (EDA)

In [2]:
# Load the ISIC 2024 test metadata into a raw, never-modified dataframe.
ROOT_DATASET_DIR = "../isic-2024-challenge"
#image_path=os.path.join(ROOT_DATASET_DIR,"train-image","image")
file_name = os.path.join(
    ROOT_DATASET_DIR,
    "test-metadata.csv",
)
df_raw = pd.read_csv(filepath_or_buffer=file_name)

In [3]:
# Analyze the features: preview the first five rows of the raw metadata.
df_raw.head(5)

Unnamed: 0,isic_id,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license
0,ISIC_0015657,IP_6074337,45.0,male,posterior torso,2.7,TBP tile: close-up,3D: XP,22.80433,20.00727,...,0.304827,1.281532,2.299935,0.479339,20,-155.0651,1511.222,113.9801,Memorial Sloan Kettering Cancer Center,CC-BY
1,ISIC_0015729,IP_1664139,35.0,female,lower extremity,2.52,TBP tile: close-up,3D: XP,16.64867,9.657964,...,0.0,1.27194,2.011223,0.42623,25,-112.36924,629.535889,-15.019287,"Frazer Institute, The University of Queensland...",CC-BY
2,ISIC_0015740,IP_7142616,65.0,male,posterior torso,3.16,TBP tile: close-up,3D: XP,24.25384,19.93738,...,0.230742,1.080308,2.705857,0.366071,110,-84.29282,1303.978,-28.57605,FNQH Cairns,CC-BY


44 features in total in the raw test metadata (confirmed by the shape check below).

In [4]:
# Summary statistics of the numeric columns, transposed so that each
# feature is a row and each statistic (count, mean, std, ...) is a column.
stats = df_raw.describe().transpose()
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age_approx,3.0,48.333333,15.275252,35.0,40.0,45.0,55.0,65.0
clin_size_long_diam_mm,3.0,2.793333,0.330051,2.52,2.61,2.7,2.93,3.16
tbp_lv_A,3.0,21.235613,4.037983,16.64867,19.7265,22.80433,23.529085,24.25384
tbp_lv_Aext,3.0,16.534205,5.955101,9.657964,14.797672,19.93738,19.972325,20.00727
tbp_lv_B,3.0,30.055107,1.508777,28.38412,29.4239,30.46368,30.8906,31.31752
tbp_lv_Bext,3.0,27.650733,0.679182,27.04364,27.283979,27.524318,27.954279,28.38424
tbp_lv_C,3.0,36.939135,1.795292,35.46781,35.938953,36.4101,37.6748,38.9395
tbp_lv_Cext,3.0,32.498746,2.930254,29.16958,31.40479,33.64,34.16333,34.68666
tbp_lv_H,3.0,54.900061,6.153928,51.22096,51.347845,51.47473,56.739612,62.004494
tbp_lv_Hext,3.0,59.695153,9.525959,53.50543,54.21042,54.91541,62.790014,70.664619


In [5]:
# df_raw.shape is (rows, columns): rows are samples, columns are features.
NumSamples, NumFeatures = df_raw.shape
print(f"Number of samples: {NumSamples}")
print(f"Number of features: {NumFeatures}")

Number of samples: 3
Number of features: 44


In [6]:
# Analyze data types: list every column with its non-null count and dtype.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   isic_id                      3 non-null      object 
 1   patient_id                   3 non-null      object 
 2   age_approx                   3 non-null      float64
 3   sex                          3 non-null      object 
 4   anatom_site_general          3 non-null      object 
 5   clin_size_long_diam_mm       3 non-null      float64
 6   image_type                   3 non-null      object 
 7   tbp_tile_type                3 non-null      object 
 8   tbp_lv_A                     3 non-null      float64
 9   tbp_lv_Aext                  3 non-null      float64
 10  tbp_lv_B                     3 non-null      float64
 11  tbp_lv_Bext                  3 non-null      float64
 12  tbp_lv_C                     3 non-null      float64
 13  tbp_lv_Cext             

### Removing irrelevant features

In [7]:
# Columns removed before the analysis; df_raw itself is left untouched
# because drop() returns a new dataframe.
columns_to_be_dropped = [
    'patient_id',
    'image_type',
    'attribution',
    'copyright_license',
]
df_dropped = df_raw.drop(columns_to_be_dropped, axis="columns")

In [8]:
# Helper that reports only the columns containing NaN values
def print_NaNs(df):
    """Report the number of NaN values per column of ``df``.

    Returns a Series holding the NaN count of every column that has at
    least one NaN. When no column has any NaN, a message is printed
    instead and the function returns None.
    """
    missing_counts = df.isna().sum()
    has_missing = missing_counts.ne(0)
    if has_missing.any():
        return missing_counts[has_missing]
    print("The dataframe contains no NaN values")

# Check the reduced dataframe for missing values.
print_NaNs(df_dropped)

The dataframe contains no NaN values


### Converting data types

In [9]:
# Convert sex and tbp_tile_type into integer flags (0 and 1).
#
# The flags are computed from the untouched df_raw rather than from df_dropped
# itself. Encoding df_dropped from its own values is not re-runnable: on a
# second execution the column already holds 0/1, nothing equals 'male' or
# '3D: white' any more, and every row would silently become 1.
# Assignment aligns on the index, which df_dropped shares with df_raw.

# sex -> 0: male, 1: anything else (female; a missing value would also map to 1)
df_dropped['sex'] = (df_raw['sex'] != 'male').astype(int)

# tbp_tile_type -> 0: '3D: white', 1: anything else (e.g. '3D: XP')
df_dropped['tbp_tile_type'] = (df_raw['tbp_tile_type'] != '3D: white').astype(int)

In [10]:
# Store the body-location columns (anatom_site_general, tbp_lv_location,
# tbp_lv_location_simple) as pandas categoricals.
for cat_col in ('anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple'):
    df_dropped[cat_col] = pd.Categorical(df_dropped[cat_col])

In [11]:
# Verify the dtypes after the conversions above.
df_dropped.dtypes

isic_id                          object
age_approx                      float64
sex                               int32
anatom_site_general            category
clin_size_long_diam_mm          float64
tbp_tile_type                     int32
tbp_lv_A                        float64
tbp_lv_Aext                     float64
tbp_lv_B                        float64
tbp_lv_Bext                     float64
tbp_lv_C                        float64
tbp_lv_Cext                     float64
tbp_lv_H                        float64
tbp_lv_Hext                     float64
tbp_lv_L                        float64
tbp_lv_Lext                     float64
tbp_lv_areaMM2                  float64
tbp_lv_area_perim_ratio         float64
tbp_lv_color_std_mean           float64
tbp_lv_deltaA                   float64
tbp_lv_deltaB                   float64
tbp_lv_deltaL                   float64
tbp_lv_deltaLB                  float64
tbp_lv_deltaLBnorm              float64
tbp_lv_eccentricity             float64


### Applying column transformers

In [13]:
# Skewness correction: the features below get a log transform (log1p) and
# tbp_lv_eccentricity gets a square transform.
# Features with negative values are deliberately left out so that the
# information carried by the sign is not lost.
features_to_be_logtr = [
    'clin_size_long_diam_mm',
    'tbp_lv_areaMM2',
    'tbp_lv_area_perim_ratio',
    'tbp_lv_color_std_mean',
    'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm',
    'tbp_lv_minorAxisMM',
    'tbp_lv_norm_border',
    'tbp_lv_norm_color',
    'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max',
    'tbp_lv_stdL',
    'tbp_lv_stdLExt',
    'tbp_lv_symm_2axis',
]
features_to_be_sqrtr = ['tbp_lv_eccentricity']

# Transform the selected columns and tag the results with a 'log_' / 'sqr_'
# prefix so they can sit next to the untransformed originals.
df_log_features = np.log1p(df_dropped[features_to_be_logtr]).add_prefix('log_')
df_sqr_features = np.square(df_dropped[features_to_be_sqrtr]).add_prefix('sqr_')

# Side-by-side frame holding only the transformed features
df_dropped_log_sqr_corr = pd.concat(
    [df_log_features, df_sqr_features],
    axis="columns",
)

In [14]:
# Final dataframe: cleaned features followed by their transformed versions.
frames_to_merge = [df_dropped, df_dropped_log_sqr_corr]
df_eda = pd.concat(frames_to_merge, axis="columns")

# Column counts of the two inputs and of the merged result.
print(f"Colums df_dropped: {df_dropped.shape[1]}")
print(f"Colums df_dropped_log_sqr_cor: {df_dropped_log_sqr_corr.shape[1]}")
print(f"Colums df_eda: {df_eda.shape[1]}")

# Persist the result (the row index is written as well, per the to_csv default).
df_eda.to_csv(path_or_buf="test-metadata-eda.csv")

Colums df_dropped: 40
Colums df_dropped_log_sqr_cor: 15
Colums df_eda: 55
