In [2]:
import numpy as np
import pandas as pd

import os
import os.path
import shutil
from glob import glob
from PIL import Image

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [3]:
# Define root and base directories
root_dir = '/Users/leaf/SpringBoard/Capstone'
base_skin_dir = root_dir + '/data/Kaggle_HAM10000'

# Build a dictionary with image id and path
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}

# Create a dictionary of abbreviation definitions
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

# Create a dictionary of index codes
lesion_index_dict = {
    'nv': 0,
    'mel': 1,
    'bkl': 2,
    'bcc': 3,
    'akiec': 4,
    'vasc': 5,
    'df': 6
}

In [4]:
# Build dataframe
tile_df = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))
tile_df['path'] = tile_df['image_id'].map(imageid_path_dict.get)
tile_df['cell_type'] = tile_df['dx'].map(lesion_type_dict.get) 
tile_df['cell_type_idx'] = tile_df['dx'].map(lesion_index_dict.get)
tile_df.sample(3)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
2470,HAM_0006834,ISIC_0027526,bcc,histo,85.0,male,face,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Basal cell carcinoma,3
5260,HAM_0001267,ISIC_0030454,nv,follow_up,45.0,male,lower extremity,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Melanocytic nevi,0
6379,HAM_0000620,ISIC_0027414,nv,follow_up,30.0,female,trunk,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Melanocytic nevi,0


In [5]:
# Load and resize images
tile_df['image'] = tile_df['path'].map(lambda x: np.asarray(Image.open(x)))

In [6]:
tile_df['cell_type'].value_counts()

Melanocytic nevi                  6705
Melanoma                          1113
Benign keratosis-like lesions     1099
Basal cell carcinoma               514
Actinic keratoses                  327
Vascular lesions                   142
Dermatofibroma                     115
Name: cell_type, dtype: int64

In [7]:
# First we group by lesion id to see how many images per lesion we have
duplicates_df = tile_df.iloc[:,:2].groupby('lesion_id').count()

# Then, we filter for all lesions that only have one associated image.
# Thus, we can be assured that the same lesion will not be found in the 
# training as well as the test images. 

duplicates_df = duplicates_df[duplicates_df['image_id'] == 1]
duplicates_df.reset_index(inplace=True)
duplicates_df.head()

Unnamed: 0,lesion_id,image_id
0,HAM_0000001,1
1,HAM_0000003,1
2,HAM_0000004,1
3,HAM_0000007,1
4,HAM_0000008,1


In [8]:
# Prepare a function to identify lesions with no duplicate images
def identify_duplicates(x):
    
    unique_list = list(duplicates_df['lesion_id'])
    
    if x in unique_list:
        return 'no_duplicates'
    else:
        return 'has_duplicates'
    
# Add a column to our dataframe that will mark images with duplicate pairs
tile_df['duplicates'] = tile_df['lesion_id']

# Use our function to map lesion ID to a 'duplicate' marker in the main dataframe
tile_df['duplicates'] = tile_df['duplicates'].apply(identify_duplicates)

tile_df.head(3)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,image,duplicates
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Benign keratosis-like lesions,2,"[[[188, 147, 191], [186, 148, 189], [187, 150,...",has_duplicates
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Benign keratosis-like lesions,2,"[[[25, 15, 23], [25, 14, 22], [25, 14, 22], [2...",has_duplicates
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Benign keratosis-like lesions,2,"[[[186, 128, 140], [188, 128, 136], [183, 126,...",has_duplicates


In [9]:
tile_df['duplicates'].value_counts()

no_duplicates     5514
has_duplicates    4501
Name: duplicates, dtype: int64

In [10]:
# We will use images with no duplicates to build our test and validation sets
no_duplicates_df = tile_df[tile_df['duplicates'] == 'no_duplicates']

no_duplicates_df.shape

(5514, 12)

In [11]:
# Now we select a portion of our 'no_duplicate' images to create our test and
# validation sets. Note that test_size = .55 so that we end up with ~3k images
# which represents about 30% of our full dataset of ~10k images. 
y = no_duplicates_df['cell_type_idx']
_, test_val_df = train_test_split(no_duplicates_df, 
                                    test_size = .55, 
                                    stratify=y, 
                                    random_state=108) 

In [12]:
test_val_df.shape

(3033, 12)

In [13]:
# Now we choose 20% of our test images to create our validation set 
y = test_val_df['cell_type_idx']
test_df, valid_df = train_test_split(test_val_df, 
                                       test_size = .2, 
                                       stratify=y, 
                                       random_state=108) 

In [14]:
# Now that we have our test and validation images set up, let's
# construct a training dataframe that contains the remaining images

# Define a function to identify if an image is in our test or 
# validation sets. 
def identify_val_rows(x):
    # create a list of all the lesion_id's in the val set
    test_list = list(test_df['image_id'])
    val_list = list(valid_df['image_id'])
    
    if str(x) in test_list or str(x) in val_list:
        return 'test_val'
    else:
        return 'train'

# Create a new colum that is a copy of the image_id column
tile_df['train_or_val'] = tile_df['image_id']
# Apply our function to this new column to create a label for images
# in test or validation sets
tile_df['train_or_val'] = tile_df['train_or_val'].apply(identify_val_rows)
   
# Finally, we create a training dataframe with the appropriate images
train_df = tile_df[tile_df['train_or_val'] == 'train']

In [15]:
print(f"Training image count: {train_df.shape[0]}")
print(f"Test image count: {test_df.shape[0]}")
print(f"Validation image count: {valid_df.shape[0]}")

Training image count: 6982
Test image count: 2426
Validation image count: 607


In [16]:
train_df['dx'].value_counts()

nv       4276
mel       986
bkl       857
bcc       418
akiec     244
vasc      107
df         94
Name: dx, dtype: int64

In [17]:
test_df['dx'].value_counts()

nv       1943
bkl       193
mel       102
bcc        77
akiec      66
vasc       28
df         17
Name: dx, dtype: int64

In [18]:
valid_df['dx'].value_counts()

nv       486
bkl       49
mel       25
bcc       19
akiec     17
vasc       7
df         4
Name: dx, dtype: int64

In [19]:
# We can see that our train, test and validation sets add up to 10,015 images. 
# The train/test/valid split is (.7, .24, .6)

In [20]:
####################### PROCEED TO TRAINING SET AUGMENTATION #######################

In [21]:
# Next we will be augmenting our training data. This will serve a dual purpose: 
# to increase our sample size, and to balance our dataset. Presently, our data is 
# significantly imbalanced. Based on the representation of each class in our 
# training set, we will selectively augment the data such that the total 
# representation of each class will become approximately equivalent. 

train_df['dx'].value_counts()

nv       4276
mel       986
bkl       857
bcc       418
akiec     244
vasc      107
df         94
Name: dx, dtype: int64

In [22]:
# Let's create a dataframe that will tell us how many augmentation images
# we need for eaching training sample to appropriately balance our data. 
# To begin, we create a dataframe with each class name and it's associated 
# index (sorted by representation in our training dataset).
aug_df = train_df.iloc[:,8:10].drop_duplicates().\
                sort_values('cell_type_idx').reset_index().drop('index', axis=1)
aug_df

Unnamed: 0,cell_type,cell_type_idx
0,Melanocytic nevi,0
1,Melanoma,1
2,Benign keratosis-like lesions,2
3,Basal cell carcinoma,3
4,Actinic keratoses,4
5,Vascular lesions,5
6,Dermatofibroma,6


In [23]:
# Append an 'aug_multiplier' to aug_df that tells us how many augmented images to generate
# for each training sample of a given class.

target_image_count = len(train_df[train_df['dx'] == 'nv']) # 4276 - the number of images in our largest class

aug_df['aug_multiplier'] = [target_image_count//i for i in tile_df['cell_type'].value_counts()]
aug_df

Unnamed: 0,cell_type,cell_type_idx,aug_multiplier
0,Melanocytic nevi,0,0
1,Melanoma,1,3
2,Benign keratosis-like lesions,2,3
3,Basal cell carcinoma,3,8
4,Actinic keratoses,4,13
5,Vascular lesions,5,30
6,Dermatofibroma,6,37


In [24]:
# Create a dictionary with keys = lesion_index, values = aumentation_multiplier

temp_dict = aug_df.drop(['cell_type', 'cell_type_idx'], axis=1).to_dict()
aug_dict = temp_dict.get('aug_multiplier')
aug_dict

{0: 0, 1: 3, 2: 3, 3: 8, 4: 13, 5: 30, 6: 37}

In [25]:
# Create a new column in each dataframe that has our augmentation value for each sample

tile_df['aug_multiplier'] = tile_df['cell_type_idx'].map(lambda x: aug_dict.get(x))
train_df['aug_multiplier'] = train_df['cell_type_idx'].map(lambda x: aug_dict.get(x))

In [26]:
tile_df.head(3)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,image,duplicates,train_or_val,aug_multiplier
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Benign keratosis-like lesions,2,"[[[188, 147, 191], [186, 148, 189], [187, 150,...",has_duplicates,train,3
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Benign keratosis-like lesions,2,"[[[25, 15, 23], [25, 14, 22], [25, 14, 22], [2...",has_duplicates,train,3
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/Users/leaf/SpringBoard/Capstone/data/Kaggle_H...,Benign keratosis-like lesions,2,"[[[186, 128, 140], [188, 128, 136], [183, 126,...",has_duplicates,train,3


In [27]:
# The balanced directory will contain all of our balanced data after augmentation
balanced_dir = os.path.join(root_dir, 'data', 'balanced_dir')
os.mkdir(balanced_dir)

train_dir = os.path.join(balanced_dir, 'train')
os.mkdir(train_dir)
test_dir = os.path.join(balanced_dir, 'test')
os.mkdir(test_dir)
valid_dir = os.path.join(balanced_dir, 'valid')
os.mkdir(valid_dir)


# Initiate folders by class within train, test and valid sets
def create_class_directories(path):
    nv = os.path.join(path, 'nv')
    os.mkdir(nv)
    mel = os.path.join(path, 'mel')
    os.mkdir(mel)
    bkl = os.path.join(path, 'bkl')
    os.mkdir(bkl)
    bcc = os.path.join(path, 'bcc')
    os.mkdir(bcc)
    akiec = os.path.join(path, 'akiec')
    os.mkdir(akiec)
    vasc = os.path.join(path, 'vasc')
    os.mkdir(vasc)
    df = os.path.join(path, 'df')
    os.mkdir(df)
    
create_class_directories(os.path.join(train_dir))
create_class_directories(os.path.join(test_dir))
create_class_directories(os.path.join(valid_dir))

In [28]:
# Initialize the data generator we will use to create our augmented data

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

datagen = ImageDataGenerator(
        rescale=None,
        rotation_range=30,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        featurewise_center=False,  
        samplewise_center=False,
        featurewise_std_normalization=False,
        samplewise_std_normalization=False,
        fill_mode='reflect')

In [29]:
# This loop will generate n augmentation images where n is our aug_multiplier.  Once these files have been generated, 
# we will have a relatively balanced dataset. 

for i, row in train_df.iterrows():
    img = load_img(row['path'])  # this is a PIL image
    x = img_to_array(img)  # this is a Numpy array with shape (600, 450, 3)
    x = x.reshape((1,) + x.shape)  # this is a Numpy array with shape (1, 600, 450, 3)

    counter = 0
    if row['aug_multiplier'] != 0:  # Skip augmentation for class 'nv'
        for batch in datagen.flow(x, batch_size = 1,
                                    save_to_dir = os.path.join(balanced_dir, 'train', row['dx']),
                                    save_prefix = row['dx'], save_format = 'jpeg'):
            counter += 1
            if counter > row['aug_multiplier']:
                break  

In [30]:
# Now that we have uploaded our augmentation images, we will add the original images to 
# their appropriate directories. Once that is complete, we will have a balanced training set
# and test/validation sets that contain no similar images in the training group. 

In [31]:
# Since there is significant augmentation of the training images, the numbers are higher 
# in this category. 
def display_image_count():
    for folder in os.listdir(balanced_dir):
        if folder == '.ipynb_checkpoints':
            pass
        else:
            print('#'* 60)
            folder_total = 0
            for path_ext in lesion_index_dict.keys():
                folder_total += len(os.listdir(os.path.join(balanced_dir, folder, path_ext)))
                print(f'Directory: "{folder}/{path_ext}" contains\
                      {len(os.listdir(os.path.join(balanced_dir, folder, path_ext)))} images')
            print("\n", " " * 35, f"Total {folder} images: {folder_total}")

In [32]:
# These are all of our augmented images
display_image_count()

############################################################
Directory: "test/nv" contains                      0 images
Directory: "test/mel" contains                      0 images
Directory: "test/bkl" contains                      0 images
Directory: "test/bcc" contains                      0 images
Directory: "test/akiec" contains                      0 images
Directory: "test/vasc" contains                      0 images
Directory: "test/df" contains                      0 images

                                     Total test images: 0
############################################################
Directory: "train/nv" contains                      0 images
Directory: "train/mel" contains                      3252 images
Directory: "train/bkl" contains                      2882 images
Directory: "train/bcc" contains                      3101 images
Directory: "train/akiec" contains                      2919 images
Directory: "train/vasc" contains                      2821 images
Di

In [33]:
# The code in this cell is adapted from:
# https://www.kaggle.com/vbookshelf/skin-lesion-analyzer-tensorflow-js-web-app

# Set the image_id as the index in a temporary dataframe
transfer_df = tile_df.set_index('image_id')

# Get a list of images in each of the two folders
folder_1 = os.listdir('../data/Kaggle_HAM10000/ham10000_images_part_1')
folder_2 = os.listdir('../data/Kaggle_HAM10000/ham10000_images_part_2')

# Get a list of train and val images
train_list = list(train_df['image_id'])
test_list = list(test_df['image_id'])
valid_list = list(valid_df['image_id'])

# Transfer training images
for image in train_list:
    
    fname = image + '.jpg'
    label = transfer_df.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('../data/Kaggle_HAM10000/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('../data/Kaggle_HAM10000/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)
        
# Transfer test images

for image in test_list:
    
    fname = image + '.jpg'
    label = transfer_df.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('../data/Kaggle_HAM10000/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(test_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('../data/Kaggle_HAM10000/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(test_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

# Transfer the validation images

for image in valid_list:
    
    fname = image + '.jpg'
    label = transfer_df.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('../data/Kaggle_HAM10000/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(valid_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('../data/Kaggle_HAM10000/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(valid_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

In [34]:
# This is our complete dataset with the original images added
display_image_count()

############################################################
Directory: "test/nv" contains                      1943 images
Directory: "test/mel" contains                      102 images
Directory: "test/bkl" contains                      193 images
Directory: "test/bcc" contains                      77 images
Directory: "test/akiec" contains                      66 images
Directory: "test/vasc" contains                      28 images
Directory: "test/df" contains                      17 images

                                     Total test images: 2426
############################################################
Directory: "train/nv" contains                      4276 images
Directory: "train/mel" contains                      4238 images
Directory: "train/bkl" contains                      3739 images
Directory: "train/bcc" contains                      3519 images
Directory: "train/akiec" contains                      3163 images
Directory: "train/vasc" contains                   