# DEEP LEARNING FOR SKIN LESIONS

This is our primary jupyter notebook for this project.

Imports:

In [5]:
import numpy as np
import pandas as pd


# import sklearn stuff
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

from PIL import Image


# set up for plotting figures in the notebook
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

#for image processing
from glob import glob

Phase 1: Load in the data


In [None]:
#NOTE: THIS DOWLOAD CODE IS FROM KAGGLEHUB WEBSITE

# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "HAM10000_metadata.csv"

# Load the latest version
df = kagglehub.dataset_load( #changed from load_dataaset cause apparently it's going to be deprecated lol
  KaggleDatasetAdapter.PANDAS,
  "kmader/skin-cancer-mnist-ham10000",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

# split off 30% to be validation/test sets
train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df['dx'], random_state=42)

# split this set (30% of original data) into halves, so 15% val, 15% test
val_df, test_df = train_test_split(temp_df, test_size=0.50, stratify=temp_df['dx'], random_state=42)

# print(len(train_df), len(val_df), len(test_df))

First 5 records:      lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear
7010 3005
1502 1503


In [36]:
metadata = pd.read_csv('HAM10000/HAM10000_metadata')
metadata.head(5)

print(metadata['dx'].unique())
print(metadata['dx_type'].unique())
print(metadata['sex'].unique())
print(metadata['localization'].unique())
print(metadata['dataset'].unique())

metadata_encoded = metadata.join(pd.get_dummies(metadata['dx'], prefix='dx', dtype=int))
metadata_encoded = metadata_encoded.drop(columns=['dx'])

metadata_encoded = metadata_encoded.join(pd.get_dummies(metadata_encoded['dx_type'], prefix='dx_type', dtype=int))
metadata_encoded = metadata_encoded.drop(columns=['dx_type'])

metadata_encoded = metadata_encoded.join(pd.get_dummies(metadata_encoded['sex'], prefix='sex', dtype=int))
metadata_encoded = metadata_encoded.drop(columns=['sex'])

metadata_encoded = metadata_encoded.join(pd.get_dummies(metadata_encoded['localization'], prefix='localization', dtype=int))
metadata_encoded = metadata_encoded.drop(columns=['localization'])

metadata_encoded = metadata_encoded.join(pd.get_dummies(metadata_encoded['dataset'], prefix='dataset', dtype=int))
metadata_encoded = metadata_encoded.drop(columns=['dataset'])

new_order = ['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
['histo' 'consensus' 'confocal' 'follow_up']
['male' 'female' 'unknown']
['scalp' 'ear' 'face' 'back' 'trunk' 'chest' 'upper extremity' 'abdomen'
 'unknown' 'lower extremity' 'genital' 'neck' 'hand' 'foot' 'acral']
['vidir_modern' 'rosendahl' 'vienna_dias' 'vidir_molemax']
index = ['image_id', 'dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df',
       'dx_mel', 'dx_nv', 'dx_vasc', 'dx_type_confocal', 'dx_type_consensus',
       'dx_type_follow_up', 'dx_type_histo', 'age', 'sex_female', 'sex_male',
       'sex_unknown', 'localization_abdomen', 'localization_acral',
       'localization_back', 'localization_chest', 'localization_ear',
       'localization_face', 'localization_foot', 'localization_genital',
       'localization_hand', 'localization_lower extremity',
       'localization_neck', 'localization_scalp', 'localization_trunk',
       'localization_unknown', 'localization_upper extremity',
       'dataset_rosendahl', 'dataset_vidir_modern', 'dataset_vidir_molemax',
       'dataset_vienna_dias', 'lesion_id']

metadata_encoded = metadata_encoded[index]
metadata_encoded = metadata_encoded.drop(columns=['lesion_id'])
print(metadata_encoded.columns)

['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
['histo' 'consensus' 'confocal' 'follow_up']
['male' 'female' 'unknown']
['scalp' 'ear' 'face' 'back' 'trunk' 'chest' 'upper extremity' 'abdomen'
 'unknown' 'lower extremity' 'genital' 'neck' 'hand' 'foot' 'acral']
['vidir_modern' 'rosendahl' 'vienna_dias' 'vidir_molemax']
Index(['image_id', 'dx_akiec', 'dx_bcc', 'dx_bkl', 'dx_df', 'dx_mel', 'dx_nv',
       'dx_vasc', 'dx_type_confocal', 'dx_type_consensus', 'dx_type_follow_up',
       'dx_type_histo', 'age', 'sex_female', 'sex_male', 'sex_unknown',
       'localization_abdomen', 'localization_acral', 'localization_back',
       'localization_chest', 'localization_ear', 'localization_face',
       'localization_foot', 'localization_genital', 'localization_hand',
       'localization_lower extremity', 'localization_neck',
       'localization_scalp', 'localization_trunk', 'localization_unknown',
       'localization_upper extremity', 'dataset_rosendahl',
       'dataset_vidir_modern', 'datase

In [None]:
metadata_encoded = metadata_encoded.set_index('image_id').T.to_dict('list')

# print(len(metadata_encoded['ISIC_0027419']))
# print(len(index))

34
36


Image Processing + Input to Model

In [None]:
#get the path for each image
#ham10000_pt1 has the paths for all images in the ham10000_pt1 folder, same for pt2.
ham10000_pt1 = glob('HAM10000/HAM10000_part1/*.jpg')
ham1000_pt2 = glob('HAM10000/HAM10000_part2/*.jpg')

print(ham10000_pt1[0])
print(ham1000_pt2[0])

HAM10000/HAM10000_part1\ISIC_0024306.jpg
HAM10000/HAM10000_part2\ISIC_0029306.jpg


In [None]:
#make the metadata a pandas dataframe



In [44]:
#DATA PROCESSING FOR GENERAL CLASSIFIERS

#loop oer each image
    #resize 
    #flatten the 3d array
    #each row create a mapping
    #process the metadata(ex: mapping labels to numbers)
    #add all of the info to the accum numpy array 
#

pixel_rep = []

total_data = pd.DataFrame()

for path in ham10000_pt1:
    #open the image
    image = Image.open(path)

    image_id = path[25:len(path)]

    #resize it, this will keep the aspect ratio
    image = image.thumbnail((100, 100))

    #image.save('image_thumbnail.jpg')
    #print(image.size) # Output: (100, 100)

    #make our image into a series of pixes, MxNx3
    image_to_pixels = plt.imread(image)

    #flatten the 3D matrix
    flattened_array = np.flatten(image_to_pixels)
    print(flattened_array)
    break

    #process all of the metadata
    

    #need to add all of the metadata accosiated with this image
    # obseration = metadata[image_id]
    # np.insert(flattened_array,0, obseration)
    #pixel_rep.append(plt.imread(image))

# for path in ham1000_pt2:
    #pixel_rep.append(plt.imread(path))


#column_names = ["dx","dx_type","age","sex","localization","dataset", "pixel0", "pixel1", etc]



# print("-----Results of reading in the image with imread-----")
# print("First item in the array:")
# print(pixel_rep[0])

# print(img_test)
# print(ham1000_pt2[0])
# print(ham10000_pt1[0])

AttributeError: 'NoneType' object has no attribute 'read'

In [None]:
print(len(pixel_rep))

#pd.DataFrame(pixel_rep, columns=['image'])

10015


ValueError: Must pass 2-d input. shape=(10015, 450, 600, 3)

In [None]:
#diide each RGB by 255 to get numbers in range 0 - 1