# DEEP LEARNING FOR SKIN LESIONS

This is our primary Jupyter notebook for this project.

Imports:

In [2]:
import numpy as np
import pandas as pd


# import sklearn stuff
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

from PIL import Image


# set up for plotting figures in the notebook
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

#for image processing
from glob import glob

Phase 1: Load in the data


In [None]:
# NOTE: this download code is adapted from the KaggleHub website.
# It fetches the HAM10000 metadata table and makes a stratified 70/15/15 split.
# NOTE(review): the frame printed by this cell shows no `dataset` column, while the
# local CSV loaded in a later cell has one -- confirm which source is intended.
# NOTE(review): a later cell reassigns train_df / temp_df / val_df / test_df from the
# local CSV, so the split made here is overwritten when the notebook runs top to bottom.

# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "HAM10000_metadata.csv"

# Load the latest version
df = kagglehub.dataset_load( # changed from load_dataset because it is apparently going to be deprecated
  KaggleDatasetAdapter.PANDAS,
  "kmader/skin-cancer-mnist-ham10000",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

# split off 30% to be validation/test sets; stratify on the diagnosis column `dx`
train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df['dx'], random_state=42)

# split this set (30% of original data) into halves, so 15% val, 15% test
val_df, test_df = train_test_split(temp_df, test_size=0.50, stratify=temp_df['dx'], random_state=42)

# print(len(train_df), len(val_df), len(test_df))

First 5 records:      lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear
7010 3005
1502 1503


In [43]:
def one_hot_encode(data_frame):
    """
    One-hot encode the categorical metadata columns of our dataset.

    The columns 'dx', 'dx_type', 'sex', 'localization' and 'dataset' are each
    replaced by 0/1 indicator columns named '<column>_<category>'. A categorical
    column that is absent from data_frame is skipped instead of raising a
    KeyError. Afterwards every column in the expected list below that is still
    missing (for example a category that does not occur in this split) is added
    and filled with 0, so all splits end up with the same set of columns.

    data_frame: PD dataframe, the data frame that we would like to one hot encode.
        It is not modified.

    returns: PD dataframe, a new frame with the remaining original columns first,
        followed by the indicator columns, followed by any zero-filled columns
    """
    categorical_columns = ['dx', 'dx_type', 'sex', 'localization', 'dataset']

    expected_columns = ['image_id', 'dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df',
        'dx_mel', 'dx_nv', 'dx_vasc', 'dx_type_confocal', 'dx_type_consensus',
        'dx_type_follow_up', 'dx_type_histo', 'age', 'sex_female', 'sex_male',
        'sex_unknown', 'localization_abdomen', 'localization_acral',
        'localization_back', 'localization_chest', 'localization_ear',
        'localization_face', 'localization_foot', 'localization_genital',
        'localization_hand', 'localization_lower extremity',
        'localization_neck', 'localization_scalp', 'localization_trunk',
        'localization_unknown', 'localization_upper extremity',
        'dataset_rosendahl', 'dataset_vidir_modern', 'dataset_vidir_molemax',
        'dataset_vienna_dias', 'lesion_id']

    # Only encode the categorical columns this frame actually has.
    present = [col for col in categorical_columns if col in data_frame.columns]

    if present:
        # One call encodes every column, drops the originals and appends the
        # indicator columns in the order given by `present`.
        metadata_encoded = pd.get_dummies(data_frame, columns=present, dtype=int)
    else:
        metadata_encoded = data_frame.copy()

    # Back-fill anything a split did not produce so every split shares one schema.
    for col in expected_columns:
        if col not in metadata_encoded.columns:
            metadata_encoded[col] = 0

    return metadata_encoded

def apply_order(data_frame):
    """
    Put the one hot encoded columns into a fixed order and drop lesion_id.

    Only the columns listed below are kept, in this order: image_id, the dx
    indicators, the dx_type indicators, age, the sex indicators, the
    localization indicators and the dataset indicators. Every one of them
    (and lesion_id) has to be present in data_frame.

    data_frame: pandas data_frame, the df we would like to apply changes to

    returns: pandas data_frame, a new frame with the ordered columns
    """
    dx_columns = ['dx_' + name for name in
                  ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc')]
    dx_type_columns = ['dx_type_' + name for name in
                       ('confocal', 'consensus', 'follow_up', 'histo')]
    sex_columns = ['sex_' + name for name in ('female', 'male', 'unknown')]
    localization_columns = ['localization_' + name for name in
                            ('abdomen', 'acral', 'back', 'chest', 'ear',
                             'face', 'foot', 'genital', 'hand',
                             'lower extremity', 'neck', 'scalp', 'trunk',
                             'unknown', 'upper extremity')]
    dataset_columns = ['dataset_' + name for name in
                       ('rosendahl', 'vidir_modern', 'vidir_molemax',
                        'vienna_dias')]

    ordered_columns = (['image_id'] + dx_columns + dx_type_columns + ['age']
                       + sex_columns + localization_columns + dataset_columns
                       + ['lesion_id'])

    # Select first (lesion_id included), then remove lesion_id from the result.
    reordered = data_frame.loc[:, ordered_columns]
    return reordered.drop(columns=['lesion_id'])

In [49]:
# Load the metadata table from the local copy of the dataset.
metadata = pd.read_csv('HAM10000/HAM10000_metadata')
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed.
metadata.head(5)

# Show the distinct values of every column that gets one-hot encoded later.
print(metadata['dx'].unique())
print(metadata['dx_type'].unique())
print(metadata['sex'].unique())
print(metadata['localization'].unique())
print(metadata['dataset'].unique())


# NOTE(review): the split below works per row (per image). The first records printed
# earlier show one lesion_id with several image_ids, so images of the same lesion may
# end up in different splits -- consider a group-aware split on lesion_id.

# split off 30% to be validation/test sets; stratify on the diagnosis column `dx`
train_df, temp_df = train_test_split(metadata, test_size=0.30, stratify=metadata['dx'], random_state=42)

# split this set (30% of original data) into halves, so 15% val, 15% test
val_df, test_df = train_test_split(temp_df, test_size=0.50, stratify=temp_df['dx'], random_state=42)

['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
['histo' 'consensus' 'confocal' 'follow_up']
['male' 'female' 'unknown']
['scalp' 'ear' 'face' 'back' 'trunk' 'chest' 'upper extremity' 'abdomen'
 'unknown' 'lower extremity' 'genital' 'neck' 'hand' 'foot' 'acral']
['vidir_modern' 'rosendahl' 'vienna_dias' 'vidir_molemax']


In [50]:
# Processing the training set: one-hot encode the categorical metadata, then put the
# columns into the fixed order (apply_order also drops lesion_id).
# Note: train_df is reassigned, so the raw split is no longer available afterwards.

train_df = one_hot_encode(train_df)
train_df = apply_order(train_df)
print(train_df.columns)
print(train_df.shape)
print(train_df.iloc[0])

Index(['image_id', 'dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df', 'dx_mel', 'dx_nv',
       'dx_vasc', 'dx_type_confocal', 'dx_type_consensus', 'dx_type_follow_up',
       'dx_type_histo', 'age', 'sex_female', 'sex_male', 'sex_unknown',
       'localization_abdomen', 'localization_acral', 'localization_back',
       'localization_chest', 'localization_ear', 'localization_face',
       'localization_foot', 'localization_genital', 'localization_hand',
       'localization_lower extremity', 'localization_neck',
       'localization_scalp', 'localization_trunk', 'localization_unknown',
       'localization_upper extremity', 'dataset_rosendahl',
       'dataset_vidir_modern', 'dataset_vidir_molemax', 'dataset_vienna_dias'],
      dtype='object')
(7010, 35)
image_id                        ISIC_0031775
dx_akiec                                   0
dx_bcc                                     0
dx_bkl                                     0
dx_df                                      0
dx_mel             

In [51]:
# Processing the validation set: one-hot encode the categorical metadata, then put the
# columns into the fixed order (apply_order also drops lesion_id).
# Note: val_df is reassigned, so the raw split is no longer available afterwards.

val_df = one_hot_encode(val_df)
val_df = apply_order(val_df)
print(val_df.columns)
print(val_df.shape)
print(val_df.iloc[0])

Index(['image_id', 'dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df', 'dx_mel', 'dx_nv',
       'dx_vasc', 'dx_type_confocal', 'dx_type_consensus', 'dx_type_follow_up',
       'dx_type_histo', 'age', 'sex_female', 'sex_male', 'sex_unknown',
       'localization_abdomen', 'localization_acral', 'localization_back',
       'localization_chest', 'localization_ear', 'localization_face',
       'localization_foot', 'localization_genital', 'localization_hand',
       'localization_lower extremity', 'localization_neck',
       'localization_scalp', 'localization_trunk', 'localization_unknown',
       'localization_upper extremity', 'dataset_rosendahl',
       'dataset_vidir_modern', 'dataset_vidir_molemax', 'dataset_vienna_dias'],
      dtype='object')
(1502, 35)
image_id                        ISIC_0032982
dx_akiec                                   0
dx_bcc                                     0
dx_bkl                                     0
dx_df                                      0
dx_mel             

In [52]:
# Processing the test set: one-hot encode the categorical metadata, then put the
# columns into the fixed order (apply_order also drops lesion_id).
# Note: test_df is reassigned, so the raw split is no longer available afterwards.

test_df = one_hot_encode(test_df)
test_df = apply_order(test_df)
print(test_df.columns)
print(test_df.shape)
print(test_df.iloc[0])

Index(['image_id', 'dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df', 'dx_mel', 'dx_nv',
       'dx_vasc', 'dx_type_confocal', 'dx_type_consensus', 'dx_type_follow_up',
       'dx_type_histo', 'age', 'sex_female', 'sex_male', 'sex_unknown',
       'localization_abdomen', 'localization_acral', 'localization_back',
       'localization_chest', 'localization_ear', 'localization_face',
       'localization_foot', 'localization_genital', 'localization_hand',
       'localization_lower extremity', 'localization_neck',
       'localization_scalp', 'localization_trunk', 'localization_unknown',
       'localization_upper extremity', 'dataset_rosendahl',
       'dataset_vidir_modern', 'dataset_vidir_molemax', 'dataset_vienna_dias'],
      dtype='object')
(1503, 35)
image_id                        ISIC_0031580
dx_akiec                                   0
dx_bcc                                     0
dx_bkl                                     1
dx_df                                      0
dx_mel             

In [None]:
# labels = ['dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df', 'dx_mel', 'dx_nv', 'dx_vasc']
# #TRAIN SET: note that features are represented by x, labels by y
# #labels 
# train_df_y = train_df[labels].copy()
# #features
# train_df_x = train_df.drop(columns=labels)

# #VALIDATION SET:
# #labels 
# val_df_y = val_df[labels].copy()
# #features
# val_df_x = val_df.drop(columns=labels)

# #TEST SET:
# #labels 
# test_df_y = test_df[labels].copy()
# #features
# test_df_x = test_df.drop(columns=labels)


In [None]:
# #sanity check
# print(train_df.shape)
# print(train_df_x.shape)
# print(train_df_y.shape)
# print()
# print(train_df_x.iloc[0])
# print()
# print(train_df_y.iloc[0])

(7010, 35)
(7010, 28)
(7010, 7)

image_id                        ISIC_0031775
dx_type_confocal                           0
dx_type_consensus                          0
dx_type_follow_up                          1
dx_type_histo                              0
age                                     60.0
sex_female                                 0
sex_male                                   1
sex_unknown                                0
localization_abdomen                       0
localization_acral                         0
localization_back                          0
localization_chest                         0
localization_ear                           0
localization_face                          0
localization_foot                          0
localization_genital                       0
localization_hand                          0
localization_lower extremity               0
localization_neck                          0
localization_scalp                         0
localization_trunk    

In [104]:
# Turn each split into a dict keyed by image_id ({image_id: {column: value}}),
# so the metadata of an image can be looked up directly from its id.
train_df_dict, val_df_dict, test_df_dict = (
    split_frame.set_index('image_id').to_dict(orient='index')
    for split_frame in (train_df, val_df, test_df)
)

# train_df_dict_x = train_df_x.set_index('image_id').to_dict(orient='index')
# val_df_dict_x = val_df_x.set_index('image_id').to_dict(orient='index')
# test_df_dict_x = test_df_x.set_index('image_id').to_dict(orient='index')

In [109]:
# Sanity check: each lookup raises KeyError if the id is not in that split's dict,
# and otherwise prints how many metadata fields are stored for the image.

# this image is expected to be in test_df only
print(len(test_df_dict['ISIC_0027419']))

# this one should only be in train
print(len(train_df_dict['ISIC_0029306']))

# this one should also only be in train; print its full metadata record as well
print(len(train_df_dict['ISIC_0024306']))
print(train_df_dict['ISIC_0024306'])

34
34
34
{'dx_akiec': 0, 'dx_bcc': 0, 'dx_bkl': 0, 'dx_df': 0, 'dx_mel': 0, 'dx_nv': 1, 'dx_vasc': 0, 'dx_type_confocal': 0, 'dx_type_consensus': 0, 'dx_type_follow_up': 1, 'dx_type_histo': 0, 'age': 45.0, 'sex_female': 0, 'sex_male': 1, 'sex_unknown': 0, 'localization_abdomen': 0, 'localization_acral': 0, 'localization_back': 0, 'localization_chest': 0, 'localization_ear': 0, 'localization_face': 0, 'localization_foot': 0, 'localization_genital': 0, 'localization_hand': 0, 'localization_lower extremity': 0, 'localization_neck': 0, 'localization_scalp': 0, 'localization_trunk': 1, 'localization_unknown': 0, 'localization_upper extremity': 0, 'dataset_rosendahl': 0, 'dataset_vidir_modern': 0, 'dataset_vidir_molemax': 1, 'dataset_vienna_dias': 0}


Image Processing + Input to Model

In [129]:
# Get the path of every image.
# ham10000_pt1 holds the paths of all .jpg files in the HAM10000_part1 folder,
# ham1000_pt2 the same for HAM10000_part2.
# NOTE(review): glob does not promise a sorted order -- wrap in sorted() if a stable
# file order matters.
ham10000_pt1 = glob('HAM10000/HAM10000_part1/*.jpg')
ham1000_pt2 = glob('HAM10000/HAM10000_part2/*.jpg')

# Show the first path found in each folder.
print(ham10000_pt1[0])
print(ham1000_pt2[0])

# One list with the part1 paths followed by the part2 paths.
combined_folder_paths = ham10000_pt1 + ham1000_pt2
print(combined_folder_paths[0])
# The element at index len(ham10000_pt1) is the first part2 path.
print(combined_folder_paths[len(ham10000_pt1)])


HAM10000/HAM10000_part1\ISIC_0024306.jpg
HAM10000/HAM10000_part2\ISIC_0029306.jpg
HAM10000/HAM10000_part1\ISIC_0024306.jpg
HAM10000/HAM10000_part2\ISIC_0029306.jpg


In [123]:
#make the metadata a pandas dataframe

def print_data(image_id, row, flattened_image_np, total_data, i):
    """
    Debug helper that dumps one processed observation to stdout so that
    differences can be spotted. Intended to be used inside of process_data
    for testing.

    image_id: string, the image id (e.g. ISIC_0024306)
    row: list, metadata info; only the first 10 entries are shown
    flattened_image_np: np array, flattened image info; only the last 10 entries are shown
    total_data: pandas df, the master dataframe
    i: int, position of the row of total_data to show
    """
    separator = "----------------------------"
    print(separator)
    print()

    print(f"image_id:  {image_id!s}")

    metadata_head = row[:10]
    print(f"first 10 of the metadata:  {metadata_head!s}")

    pixel_tail = flattened_image_np[-10:]
    print(f"last 10 entries of the flattened_image_np array:  {pixel_tail!s}")

    # Show the selected row as a one-row frame.
    created_row = total_data.iloc[i].to_frame().T
    print(f"total data row that was just created:  {created_row!s}")
    print()

def add_to_list(image_id, dict_frame, flattened_image_np, final_res):
    """
    Build one observation (metadata values followed by pixel values) for
    image_id and append it to final_res.

    image_id: string, the id of the image
    dict_frame: dict, maps an image id to a dict of its metadata values;
        this is where the metadata for image_id is looked up
    flattened_image_np: numpy array, pixelized flattened version of the image
    final_res: list, the one that we want to be appending to (changed in place)
    """
    # The metadata is stored as key/value pairs; keep only the values, in order.
    metadata_values = np.array([*dict_frame[image_id].values()])

    observation = np.concatenate([metadata_values, flattened_image_np])
    final_res.append(observation)
    
def process_data(folder, test_df_dict, val_df_dict, train_df_dict):
    """
    function that processes the data from HAM10000

    Every image in folder is opened, shrunk with thumbnail((100, 100)) (which
    keeps the aspect ratio), turned into a pixel array, flattened and divided
    by 255.0. The result is appended after the image's metadata values (see
    add_to_list) in the list of the split that contains the image id.

    folder: a list, holds the names of all the paths in the folder. The file
        name without its extension is used as the image id.
    test_df_dict: dict keyed by image id, metadata of the test split
    val_df_dict: dict keyed by image id, metadata of the validation split
    train_df_dict: dict keyed by image id, metadata of the training split

    returns: tuple of three pandas dataframes (train, val, test), one row per image

    raises: KeyError if an image id is in none of the three dicts
    """
    # Imported here so the function does not depend on the notebook's import cell.
    import os

    train_final = []
    test_final = []
    val_final = []

    for i, path in enumerate(folder):
        # Take the id from the file name itself instead of a fixed character
        # offset, so any directory layout or path separator works.
        image_id = os.path.splitext(os.path.basename(path))[0]

        # The context manager releases the file handle once the pixels are read.
        with Image.open(path) as image:
            # resize it, this will keep the aspect ratio
            image.thumbnail((100, 100))
            # make our image into a series of pixels, MxNx3
            # NOTE(review): assumes every image has the same size and mode so that
            # all rows get the same length -- TODO confirm for the dataset
            img_array = np.array(image)

        # flatten the 3D matrix and scale the values
        flattened_image_np = img_array.flatten() / 255.0

        if image_id in test_df_dict:
            add_to_list(image_id, test_df_dict, flattened_image_np, test_final)
        elif image_id in val_df_dict:
            add_to_list(image_id, val_df_dict, flattened_image_np, val_final)
        elif image_id in train_df_dict:
            add_to_list(image_id, train_df_dict, flattened_image_np, train_final)
        else:
            # Still a KeyError, as before, but one that names the offending file.
            raise KeyError(
                f"image id {image_id!r} (from {path!r}) is not in the test, "
                "validation or train metadata"
            )

        if i % 1000 == 0 and i != 0:
            # A split may still be empty at this point, so guard the [-1] lookups.
            print("!! finished batch of 1000 !!")
            print("test final len: ", len(test_final))
            print("some output of final_en: ", test_final[-1] if test_final else None)
            print("val_final shape: ", len(val_final))
            print("some output of val_final: ", val_final[-1] if val_final else None)
            print("train_final shape", len(train_final))
            print("some output of train_final: ", train_final[-1] if train_final else None)

    return (pd.DataFrame(train_final), pd.DataFrame(val_final), pd.DataFrame(test_final))


In [132]:
# Append the pixel values of every image to its metadata row, one frame per split.
# Note the argument order: process_data takes the test, val and train dicts and
# returns the train, val and test frames.
train_final, val_final, test_final = process_data(combined_folder_paths, test_df_dict, val_df_dict, train_df_dict)

!! finished batch of 1000 !!
test final len:  158
some output of final_en:  [0.         0.         0.         ... 0.84705882 0.58431373 0.57647059]
val_final shape:  153
some output of val_final:  [0.         0.         0.         ... 0.81176471 0.55686275 0.53333333]
train_final shape 690
some output of train_final:  [0.         0.         0.         ... 0.88627451 0.6        0.64313725]
!! finished batch of 1000 !!
test final len:  310
some output of final_en:  [0.         0.         0.         ... 0.7372549  0.39607843 0.3372549 ]
val_final shape:  306
some output of val_final:  [0.         0.         0.         ... 0.78039216 0.56862745 0.53333333]
train_final shape 1385
some output of train_final:  [0.         0.         0.         ... 0.82352941 0.49803922 0.49019608]
!! finished batch of 1000 !!
test final len:  483
some output of final_en:  [0.         0.         0.         ... 0.83921569 0.6        0.49803922]
val_final shape:  448
some output of val_final:  [0.         0.    

In [133]:
# Sanity check: rows per split, and that all three splits have the same number of columns.
print(train_final.shape)
print(test_final.shape)
print(val_final.shape)

(7010, 22534)
(1503, 22534)
(1502, 22534)


In [134]:
# Since we iterated in the order of the image files, shuffle each set again.
# A fixed random_state (the same seed used for train_test_split) makes the shuffle,
# and the CSV files written from it, reproducible after a kernel restart.
shuffled_train_final = train_final.sample(frac=1, random_state=42).reset_index(drop=True)
shuffled_test_final = test_final.sample(frac=1, random_state=42).reset_index(drop=True)
shuffled_val_final = val_final.sample(frac=1, random_state=42).reset_index(drop=True)

In [164]:
print(shuffled_test_final.columns)

# Add the metadata column names back in. They go first, in the same order in
# which the metadata values were placed in front of the pixel values.
index_names = [
    'dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df', 'dx_mel', 'dx_nv', 'dx_vasc',
    'dx_type_confocal', 'dx_type_consensus', 'dx_type_follow_up', 'dx_type_histo',
    'age',
    'sex_female', 'sex_male', 'sex_unknown',
    'localization_abdomen', 'localization_acral', 'localization_back',
    'localization_chest', 'localization_ear', 'localization_face',
    'localization_foot', 'localization_genital', 'localization_hand',
    'localization_lower extremity', 'localization_neck', 'localization_scalp',
    'localization_trunk', 'localization_unknown', 'localization_upper extremity',
    'dataset_rosendahl', 'dataset_vidir_modern', 'dataset_vidir_molemax',
    'dataset_vienna_dias',
]

# Number of pixel columns that follow the metadata columns.
print(shuffled_train_final.shape[1] - len(index_names))

# For the pixel columns the label is just the column position.
index_names.extend(range(len(index_names), shuffled_train_final.shape[1]))

shuffled_train_final.columns = index_names
shuffled_test_final.columns = index_names
shuffled_val_final.columns = index_names

#print(shuffled_val_final)

Index([         'dx_akiec',            'dx_bcc',            'dx_bkl',
                   'dx_df',            'dx_mel',             'dx_nv',
                 'dx_vasc',  'dx_type_confocal', 'dx_type_consensus',
       'dx_type_follow_up',
       ...
                     22524,               22525,               22526,
                     22527,               22528,               22529,
                     22530,               22531,               22532,
                     22533],
      dtype='object', length=22534)
22500


In [165]:

# The one-hot encoded diagnosis columns are what we want to predict.
labels = ['dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df', 'dx_mel', 'dx_nv', 'dx_vasc']

# Now that all of the metadata is sorted out, pop off the labels:
# every split becomes a label frame (y) and a feature frame (x).

# TRAIN SET
shuffled_train_final_y, shuffled_train_final_x = (
    shuffled_train_final.loc[:, labels].copy(),
    shuffled_train_final.drop(columns=labels),
)

# VALIDATION SET
shuffled_val_final_y, shuffled_val_final_x = (
    shuffled_val_final.loc[:, labels].copy(),
    shuffled_val_final.drop(columns=labels),
)

# TEST SET
shuffled_test_final_y, shuffled_test_final_x = (
    shuffled_test_final.loc[:, labels].copy(),
    shuffled_test_final.drop(columns=labels),
)


In [166]:
# Write the label (y) and feature (x) frame of every split to its own CSV file in the
# working directory, without the row index.
# NOTE(review): the feature frames have tens of thousands of columns, so these CSVs will
# be large and slow to write -- a binary format such as parquet may be a better fit.
shuffled_train_final_y.to_csv('shuffled_train_final_y.csv', index=False)
shuffled_train_final_x.to_csv('shuffled_train_final_x.csv', index=False)

shuffled_val_final_y.to_csv('shuffled_val_final_y.csv', index=False)
shuffled_val_final_x.to_csv('shuffled_val_final_x.csv', index=False)

shuffled_test_final_y.to_csv('shuffled_test_final_y.csv', index=False)
shuffled_test_final_x.to_csv('shuffled_test_final_x.csv', index=False)