# Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
from collections import Counter
import pydicom
import cv2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns

In [3]:
# Load the three ISIC 2019 CSV files shipped with the Kaggle dataset.
# Naming note: `train_gt` is the training ground truth, whereas `train_df` and
# `test_df` hold the training / test *metadata* tables.
train_gt = pd.read_csv("/kaggle/input/isic-2019-challenge/ISIC_2019_Training_GroundTruth.csv")
test_df  = pd.read_csv("/kaggle/input/isic-2019-challenge/ISIC_2019_Test_Metadata.csv")
train_df  = pd.read_csv("/kaggle/input/isic-2019-challenge/ISIC_2019_Training_Metadata.csv")

In [4]:
# Reshape the training ground truth from wide (one column per diagnosis) to
# long (one row per image with its diagnosis name).
# The whole transformation is a single non-mutating chain: the previous version
# called drop(..., inplace=True) on a boolean-filtered slice, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
train_GT_transformed = (
    pd.melt(train_gt, id_vars='image', var_name='diagnosis', value_name='value')
    .sort_values('image')
    # Select rows when diagnosis == 1
    .loc[lambda long_df: long_df['value'] == 1]
    .drop(columns='value')
    .reset_index(drop=True)
)

# Test ground truth

In [19]:
# Load the test ground-truth CSV; it is read from a different Kaggle input
# than the rest of the ISIC 2019 files used in this notebook.
test_DT = pd.read_csv('/kaggle/input/eficient_sincancer/pytorch/default/1/ISIC_2019_Test_GroundTruth.csv')

In [20]:
# Remove both weight columns in one call so that only `image` and the
# per-diagnosis columns remain in the table.
test_DT.drop(columns=['validation_weight', 'score_weight'], inplace=True)

In [21]:
# Reshape the test ground truth from wide (one column per diagnosis) to long
# (one row per image with its diagnosis name).
# Written as one non-mutating chain: the previous version called
# drop(..., inplace=True) on a boolean-filtered slice, which is the pattern
# that triggers pandas' SettingWithCopyWarning.
test_dt = (
    pd.melt(test_DT, id_vars='image', var_name='diagnosis', value_name='value')
    .sort_values('image')
    # Select rows when diagnosis == 1
    .loc[lambda long_df: long_df['value'] == 1]
    .drop(columns='value')
    .reset_index(drop=True)
)

In [22]:
# Preview the first five rows of the long-format test ground truth.
test_dt.head(5)

Unnamed: 0,image,diagnosis
0,ISIC_0034321,NV
1,ISIC_0034322,NV
2,ISIC_0034323,BCC
3,ISIC_0034324,NV
4,ISIC_0034325,NV


In [23]:
# Number of test images per diagnosis label.
test_dt['diagnosis'].value_counts()

diagnosis
NV      2495
UNK     2047
MEL     1327
BCC      975
BKL      660
AK       374
SCC      165
VASC     104
DF        91
Name: count, dtype: int64

In [24]:
# Column dtypes and non-null counts of the long-format test ground truth.
test_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8238 entries, 0 to 8237
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   image      8238 non-null   object
 1   diagnosis  8238 non-null   object
dtypes: object(2)
memory usage: 128.8+ KB


# Merge metadata with train ground truth

In [37]:
# Attach the training metadata to each labelled training image (join on the
# image id) and discard the `lesion_id` column.
# validate='one_to_one' makes pandas raise a MergeError if either table has
# duplicate image ids, instead of silently multiplying rows in the result.
df = (
    train_GT_transformed
    .merge(train_df, on='image', validate='one_to_one')
    .drop(columns='lesion_id')
)
df.head()

Unnamed: 0,image,diagnosis,age_approx,anatom_site_general,sex
0,ISIC_0000000,NV,55.0,anterior torso,female
1,ISIC_0000001,NV,30.0,anterior torso,female
2,ISIC_0000002,MEL,60.0,upper extremity,female
3,ISIC_0000003,NV,30.0,upper extremity,male
4,ISIC_0000004,MEL,80.0,posterior torso,male


In [38]:
# Preview the test metadata table.
test_df.head()

Unnamed: 0,image,age_approx,anatom_site_general,sex
0,ISIC_0034321,60.0,,female
1,ISIC_0034322,70.0,anterior torso,male
2,ISIC_0034323,70.0,lower extremity,male
3,ISIC_0034324,70.0,lower extremity,male
4,ISIC_0034325,30.0,upper extremity,female


In [7]:
# Distribution of anatomical sites in the training data (NaN rows are not counted).
df['anatom_site_general'].value_counts()

anatom_site_general
anterior torso     6915
lower extremity    4990
head/neck          4587
upper extremity    2910
posterior torso    2787
palms/soles         398
oral/genital         59
lateral torso        54
Name: count, dtype: int64

In [10]:
# Missing values per column in the merged training frame.
df.isna().sum()

image                     0
diagnosis                 0
age_approx              437
anatom_site_general    2631
sex                     384
dtype: int64

In [11]:
# Missing values per column in the test metadata.
test_df.isna().sum()

image                    0
age_approx             326
anatom_site_general    655
sex                    339
dtype: int64

In [6]:
# Distribution of anatomical sites in the test metadata (NaN rows are not counted).
test_df['anatom_site_general'].value_counts()

anatom_site_general
anterior torso     2478
head/neck          1896
lower extremity    1778
upper extremity     789
posterior torso     312
palms/soles         290
oral/genital         40
Name: count, dtype: int64

In [13]:
# Summary statistics of the numeric columns for train and test.
# print() is used so that both summaries appear in the output of a single cell.
print(df.describe())
print(test_df.describe())

         age_approx
count  24894.000000
mean      54.028481
std       18.130971
min        0.000000
25%       40.000000
50%       55.000000
75%       70.000000
max       85.000000
        age_approx
count  7912.000000
mean     57.331269
std      18.224414
min       0.000000
25%      45.000000
50%      60.000000
75%      75.000000
max      85.000000


# Fill missing values

In [39]:
# Missing categorical metadata becomes its own explicit 'unknown' category in
# both frames.
categorical_cols = ['anatom_site_general', 'sex']
df[categorical_cols] = df[categorical_cols].fillna(value='unknown')
test_df[categorical_cols] = test_df[categorical_cols].fillna(value='unknown')

# Impute missing ages with the median of the *training* ages for both frames.
# Previously the test frame was filled with a hard-coded 60 (the test set's own
# median), i.e. a statistic derived from the test data; the training frame was
# filled with a hard-coded 55, which is the value this median evaluates to.
train_age_median = df['age_approx'].median()
df['age_approx'] = df['age_approx'].fillna(value=train_age_median)
test_df['age_approx'] = test_df['age_approx'].fillna(value=train_age_median)

# Add image path

In [40]:
# Folders that contain the JPEG files of each split.
train_directory = '/kaggle/input/isic-2019-challenge/ISIC_2019_Training_Input/ISIC_2019_Training_Input'
test_directory = '/kaggle/input/isic-2019-challenge/ISIC_2019_Test_Input/ISIC_2019_Test_Input'

# Each file path is "<directory>/<image id>.jpg"; store it next to the image id.
path_train = train_directory + '/' + df['image'] + '.jpg'
df['path_jpg'] = path_train

path_test = test_directory + '/' + test_df['image'] + '.jpg'
test_df['path_jpg'] = path_test

In [46]:
def encode_labels_train(df, column_name):
    """Fit a new LabelEncoder on one column and replace it with integer codes.

    The column is overwritten on the DataFrame that was passed in.

    Parameters
    ----------
    df : DataFrame containing ``column_name``.
    column_name : name of the column to encode.

    Returns
    -------
    tuple
        ``(df, label_encoder, label_mapping)``: the same DataFrame object, the
        fitted encoder, and a dict mapping each original class to its code.
    """
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])

    # Pair every learned class with the code the encoder assigns to it.
    known_classes = encoder.classes_
    class_codes = encoder.transform(known_classes)
    label_mapping = dict(zip(known_classes, class_codes))

    return df, encoder, label_mapping

def encode_labels_test(df, column_name, label_encoder):
    """Encode one column with an already-fitted encoder, using -1 for unseen values.

    The column is overwritten on the DataFrame that was passed in.

    Parameters
    ----------
    df : DataFrame containing ``column_name``.
    column_name : name of the column to encode.
    label_encoder : fitted encoder exposing ``classes_``; a class's code is its
        position in ``classes_`` (this is what ``LabelEncoder.transform`` returns).

    Returns
    -------
    DataFrame
        The same ``df`` object, with ``column_name`` holding int64 codes.
        Values not present in ``classes_`` (including missing values) become -1.
    """
    # Build the class -> code lookup once instead of calling
    # label_encoder.transform([x]) and scanning classes_ for every single row.
    code_by_label = {label: code for code, label in enumerate(label_encoder.classes_)}
    # Unknown labels map to NaN, which is then turned into the -1 sentinel.
    df[column_name] = df[column_name].map(code_by_label).fillna(-1).astype('int64')
    return df

In [47]:
col2 = ['sex', 'anatom_site_general']

# One fitted encoder per column, keyed by column name.
label_encoders = {}

for col in col2:
    # Fit on the training frame and show the learned class -> code mapping.
    df, label_enc, label_mapping = encode_labels_train(df, col)
    label_encoders[col] = label_enc  # keep the encoder so test_df gets the same codes
    print(f"{col}: {label_mapping}")

    # Apply the encoder fitted on the training frame to the test metadata.
    test_df = encode_labels_test(test_df, col, label_enc)

sex: {'female': 0, 'male': 1, 'unknown': 2}
anatom_site_general: {'anterior torso': 0, 'head/neck': 1, 'lateral torso': 2, 'lower extremity': 3, 'oral/genital': 4, 'palms/soles': 5, 'posterior torso': 6, 'unknown': 7, 'upper extremity': 8}


In [48]:
# Encode the target column; its encoder is stored next to the feature encoders.
col1 = ['diagnosis']
for col in col1:
    df, label_encoders[col], label_mapping = encode_labels_train(df, col)
    label_enc = label_encoders[col]
    print(f"{col}: {label_mapping}")

diagnosis: {'AK': 0, 'BCC': 1, 'BKL': 2, 'DF': 3, 'MEL': 4, 'NV': 5, 'SCC': 6, 'VASC': 7}


In [57]:
# Hold out 20% of the labelled data for validation.
# The diagnosis classes are heavily imbalanced, so the split is stratified on
# the label: both parts keep the same class proportions and rare classes cannot
# end up under-represented in validation by chance. random_state keeps the
# split reproducible.
train_data, val_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['diagnosis'],
)

In [58]:
# Row/column counts of the train split, validation split and test metadata.
train_data.shape, val_data.shape, test_df.shape

((20264, 6), (5067, 6), (8238, 5))

In [59]:
# Write each prepared frame to the working directory, without the index column.
for frame, filename in (
    (train_data, 'train_data.csv'),
    (val_data, 'val_data.csv'),
    (test_df, 'test_data.csv'),
):
    frame.to_csv(filename, index=False)

# Remove all images labelled Unknown (UNK) from the test dataset and the test ground truth

In [25]:
# Image ids whose ground-truth diagnosis is 'UNK'.
unk_images = test_dt.loc[test_dt['diagnosis'] == 'UNK', 'image']

In [26]:
# Ground truth: keep only the rows whose diagnosis is not 'UNK'.
test_dt = test_dt.loc[test_dt['diagnosis'] != 'UNK']

In [29]:
# Shape of the test ground truth after the 'UNK' rows were removed.
test_dt.shape

(6191, 2)

In [30]:
# Reload the encoded test metadata from a Kaggle input dataset.
# NOTE(review): presumably this is the test_data.csv written by this notebook
# and re-uploaded as the 'test-train' dataset — confirm, since the notebook
# cannot be run top to bottom without that manual step.
test_data = pd.read_csv('/kaggle/input/test-train/test_data.csv')
test_data.head()

Unnamed: 0,image,age_approx,anatom_site_general,sex,path_jpg
0,ISIC_0034321,60.0,7,0,/kaggle/input/isic-2019-challenge/ISIC_2019_Te...
1,ISIC_0034322,70.0,0,1,/kaggle/input/isic-2019-challenge/ISIC_2019_Te...
2,ISIC_0034323,70.0,3,1,/kaggle/input/isic-2019-challenge/ISIC_2019_Te...
3,ISIC_0034324,70.0,3,1,/kaggle/input/isic-2019-challenge/ISIC_2019_Te...
4,ISIC_0034325,30.0,8,0,/kaggle/input/isic-2019-challenge/ISIC_2019_Te...


In [31]:
# Drop the test rows whose image id belongs to the 'UNK' set.
is_known_image = ~test_data['image'].isin(unk_images)
test_data = test_data[is_known_image]

In [32]:
# Shape of the test metadata after the 'UNK' images were removed.
test_data.shape

(6191, 5)

In [35]:
# Save the UNK-free test ground truth and test metadata, without the index column.
test_dt.to_csv('test_gt_remove_unk.csv', index=False)
test_data.to_csv('test_data_remove_unk.csv', index=False)