## Defining Dataset And Its Classes 

In [1]:
from numpy.random import seed
seed(101)    # so that numpy generates same random number and save the randomness function every time
from tensorflow.random import set_seed
set_seed(101)

import numpy as np
import pandas as pd

import os
import itertools
import shutil

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN



from tensorflow.keras.preprocessing.image import ImageDataGenerator



**LABELS**<br>

Excerpts from the paper:<br>
> The HAM10000 Dataset: A Large Collection of Multi-Source Dermatoscopic Images of Common Pigmented Skin Lesions<br>
https://arxiv.org/abs/1803.10417



 **nv**<br>
 Melanocytic nevi are benign neoplasms of melanocytes and appear in a myriad of variants, which all are included in our series. The variants may differ significantly from a dermatoscopic point of view.<br>
 *[6705 images]*
 
 **mel**<br>
 Melanoma is a malignant neoplasm derived from melanocytes that may appear in different variants. If excised in an early stage it can be cured by simple surgical excision. Melanomas can be invasive or non-invasive (in situ). We included all variants of melanoma including melanoma in situ, but did exclude non-pigmented, subungual, ocular or mucosal melanoma.<br>*[1113 images]*
 
 
**bkl**<br>
 "Benign keratosis" is a generic class that includes seborrheic ker- atoses ("senile wart"), solar lentigo - which can be regarded a flat variant of seborrheic keratosis - and lichen-planus like keratoses (LPLK), which corresponds to a seborrheic keratosis or a solar lentigo with inflammation
and regression [22]. The three subgroups may look different dermatoscop- ically, but we grouped them together because they are similar biologically and often reported under the same generic term histopathologically. From a dermatoscopic view, lichen planus-like keratoses are especially challeng- ing because they can show morphologic features mimicking melanoma [23] and are often biopsied or excised for diagnostic reasons.<br>
*[1099 images]*

**bcc**<br>
Basal cell carcinoma is a common variant of epithelial skin cancer that rarely metastasizes but grows destructively if untreated. It appears in different morphologic variants (flat, nodular, pigmented, cystic, etc) [21], which are all included in this set.<br>
*[514 images]*
 
**akiec**<br>
Actinic Keratoses (Solar Keratoses) and intraepithelial Carcinoma (Bowen’s disease) are common non-invasive, variants of squamous cell car- cinoma that can be treated locally without surgery. Some authors regard them as precursors of squamous cell carcinomas and not as actual carci- nomas. There is, however, agreement that these lesions may progress to invasive squamous cell carcinoma - which is usually not pigmented. Both neoplasms commonly show surface scaling and commonly are devoid of pigment. Actinic keratoses are more common on the face and Bowen’s disease is more common on other body sites. Because both types are in- duced by UV-light the surrounding skin is usually typified by severe sun damaged except in cases of Bowen’s disease that are caused by human papilloma virus infection and not by UV. Pigmented variants exists for Bowen’s disease [19] and for actinic keratoses [20]. Both are included in this set.<br>*[327 images]*


**vasc**<br>
Vascular skin lesions in the dataset range from cherry angiomas to angiokeratomas [25] and pyogenic granulomas [26]. Hemorrhage is also included in this category.<br>
*[142 images]*

**df**<br>
Dermatofibroma is a benign skin lesion regarded as either a benign proliferation or an inflammatory reaction to minimal trauma. It is brown often showing a central zone of fibrosis dermatoscopically [24].<br>*[115 images]*


<br>*[Total images = 10015]*

# Create the directory structure

In these folders we will store the images that will later be fed to the Keras generators. 

In [3]:


# Create a new directory
base_dir = 'Data_Images'
os.mkdir(base_dir)


#[CREATE FOLDERS INSIDE THE BASE DIRECTORY]

# now we create 7 folders inside 'base_dir':

# train_dir
    # nv
    # mel
    # bkl
    # bcc
    # akiec
    # vasc
    # df
 
# val_dir
    # nv
    # mel
    # bkl
    # bcc
    # akiec
    # vasc
    # df

# create a path to 'base_dir' to which we will join the names of the new folders
# train_dir
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

# val_dir
val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)


# [CREATE FOLDERS INSIDE THE TRAIN, VALIDATION AND TEST FOLDERS]
# Inside each folder we create seperate folders for each class

# create new folders inside train_dir
nv = os.path.join(train_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(train_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(train_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(train_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(train_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(train_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(train_dir, 'df')
os.mkdir(df)



# create new folders inside val_dir
nv = os.path.join(val_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(val_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(val_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(val_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(val_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(val_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(val_dir, 'df')
os.mkdir(df)

### Create Train and Val Sets

In [4]:
df_data = pd.read_csv('input/HAM10000_metadata.csv')
print(df_data.shape)
df_data.head()

(10015, 7)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [5]:
df_data['dx'].value_counts()

nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: dx, dtype: int64

### Create a stratified val set

In [6]:
# this will tell us how many images are associated with each lesion_id
df = df_data.groupby('lesion_id').count()

# now we filter out lesion_id's that have only one image associated with it
df = df[df['image_id'] == 1]

df.reset_index(inplace=True)

print(df.shape)
df.head()

(5514, 7)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000001,1,1,1,1,1,1
1,HAM_0000003,1,1,1,1,1,1
2,HAM_0000004,1,1,1,1,1,1
3,HAM_0000007,1,1,1,1,1,1
4,HAM_0000008,1,1,1,1,1,1


In [7]:
# here we identify lesion_id's that have duplicate images and those that have only
# one image.

def identify_duplicates(x):
    
    unique_list = list(df['lesion_id'])
    
    if x in unique_list:
        return 'no_duplicates'
    else:
        return 'has_duplicates'
    
# create a new colum that is a copy of the lesion_id column
df_data['duplicates'] = df_data['lesion_id']
# apply the function to this new column
df_data['duplicates'] = df_data['duplicates'].apply(identify_duplicates)

print(df_data.shape)
df_data.head()

(10015, 8)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicates
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,has_duplicates
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,has_duplicates
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,has_duplicates
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,has_duplicates
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,has_duplicates


In [8]:
df_data.dx.value_counts()

nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: dx, dtype: int64

In [9]:
# now we filter out images that don't have duplicates
df = df_data[df_data['duplicates'] == 'no_duplicates']

df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicates
10,HAM_0001396,ISIC_0025276,bkl,histo,55.0,female,trunk,no_duplicates
15,HAM_0007207,ISIC_0031326,bkl,histo,65.0,male,back,no_duplicates
20,HAM_0006071,ISIC_0032343,bkl,histo,70.0,female,face,no_duplicates
33,HAM_0005612,ISIC_0024981,bkl,histo,80.0,male,scalp,no_duplicates
34,HAM_0005388,ISIC_0027815,bkl,histo,80.0,male,chest,no_duplicates


In [10]:
df['dx'].value_counts()

nv       4415
bkl       440
mel       230
bcc       175
akiec     151
vasc       64
df         39
Name: dx, dtype: int64

In [11]:
# now we create a val set using df because we are sure that none of these images
# have augmented duplicates in the train set
y = df['dx']

_, df_val = train_test_split(df, test_size=0.17, random_state=101, stratify=y)

df_val.shape

(938, 8)

In [12]:
df_val['dx'].value_counts()

nv       751
bkl       75
mel       39
bcc       30
akiec     26
vasc      11
df         6
Name: dx, dtype: int64

### Create a train set that excludes images that are in the val set

In [13]:
# This set will be df_data excluding all rows that are in the val set

# This function identifies if an image is part of the train
# or val set.
def identify_val_rows(x):
    # create a list of all the lesion_id's in the val set
    val_list = list(df_val['image_id'])
    
    if str(x) in val_list:
        return 'val'
    else:
        return 'train'

# identify train and val rows

# create a new colum that is a copy of the image_id column
df['train_or_val'] = df['image_id']
# apply the function to this new column
df['train_or_val'] = df['train_or_val'].apply(identify_val_rows)
   
# filter out train rows
df_train = df[df['train_or_val'] == 'train']


print(len(df_train))
print(len(df_val))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['train_or_val'] = df['image_id']


4576
938


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['train_or_val'] = df['train_or_val'].apply(identify_val_rows)


In [14]:
# nv       5954
# mel      1074
# bkl      1024
# bcc       484
# akiec     301
# vasc      131
# df        109
# Name: dx, dtype: int64
        
df_train['dx'].value_counts()

nv       3664
bkl       365
mel       191
bcc       145
akiec     125
vasc       53
df         33
Name: dx, dtype: int64

In [15]:
df_val['dx'].value_counts()

nv       751
bkl       75
mel       39
bcc       30
akiec     26
vasc      11
df         6
Name: dx, dtype: int64

### Transfer the Images into the Folders

In [16]:
# Set the image_id as the index in df_data
df_data.set_index('image_id', inplace=True)

In [17]:
# Get a list of images in each of the two folders
folder_1 = os.listdir('./input/ham10000_images_part_1')
folder_2 = os.listdir('./input/ham10000_images_part_2')

# Get a list of train and val images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

# Transfer the train images

for image in train_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('./input/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('./input/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)
        
# Transfer the val images

for image in val_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('./input/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('./input/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

In [18]:
# check how many train images we have in each folder

print(len(os.listdir('Data_Images/train_dir/nv')))
print(len(os.listdir('Data_Images/train_dir/mel')))
print(len(os.listdir('Data_Images/train_dir/bkl')))
print(len(os.listdir('Data_Images/train_dir/bcc')))
print(len(os.listdir('Data_Images/train_dir/akiec')))
print(len(os.listdir('Data_Images/train_dir/vasc')))
print(len(os.listdir('Data_Images/train_dir/df')))

3664
191
365
145
125
53
33


In [19]:
# check how many val images we have in each folder

print(len(os.listdir('Data_Images/val_dir/nv')))
print(len(os.listdir('Data_Images/val_dir/mel')))
print(len(os.listdir('Data_Images/val_dir/bkl')))
print(len(os.listdir('Data_Images/val_dir/bcc')))
print(len(os.listdir('Data_Images/val_dir/akiec')))
print(len(os.listdir('Data_Images/val_dir/vasc')))
print(len(os.listdir('Data_Images/val_dir/df')))

751
39
75
30
26
11
6


### Copy the Train images into aug_dir

In [20]:
# note that we are not augmenting class 'nv'
class_list = ['mel','bkl','bcc','akiec','vasc','df']

for item in class_list:
    
    # We are creating temporary directories here because we delete these directories later
    # create a base dir
    aug_dir = 'aug_dir'
    os.mkdir(aug_dir)
    # create a dir within the base dir to store images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    # Choose a class
    img_class = item

    # list all images in that directory
    img_list = os.listdir('Data_Images/train_dir/' + img_class)

    # Copy images from the class train dir to the img_dir e.g. class 'mel'
    for fname in img_list:
            # source path to image
            src = os.path.join('Data_Images/train_dir/' + img_class, fname)
            # destination path to image
            dst = os.path.join(img_dir, fname)
            # copy the image from the source to the destination
            shutil.copyfile(src, dst)


    # point to a dir containing the images and not to the images themselves
    path = aug_dir
    save_path = 'Data_Images/train_dir/' + img_class

    # Create a data generator
    datagen = ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        #brightness_range=(0.9,1.1),
        fill_mode='nearest')

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,
                                           save_to_dir=save_path,
                                           save_format='jpg',
                                                    target_size=(224,224),
                                                    batch_size=batch_size)



    # Generate the augmented images and add them to the training folders
    
    ###########
    
    num_aug_images_wanted = 700 # total number of images we want to have in each class
    
    ###########
    
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted-num_files)/batch_size))

    # run the generator and create about 6000 augmented images
    for i in range(0,num_batches):

        imgs, labels = next(aug_datagen)
        
    # delete temporary directory with the raw image files
    shutil.rmtree('aug_dir')

Found 191 images belonging to 1 classes.
Found 365 images belonging to 1 classes.
Found 145 images belonging to 1 classes.
Found 125 images belonging to 1 classes.
Found 53 images belonging to 1 classes.
Found 33 images belonging to 1 classes.


In [21]:
# note that we are not augmenting class 'nv'
class_list = ['mel','bkl','bcc','akiec','vasc','df']

for item in class_list:
    
    # We are creating temporary directories here because we delete these directories later
    # create a base dir
    aug_dir = 'aug_dir'
    os.mkdir(aug_dir)
    # create a dir within the base dir to store images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    # Choose a class
    img_class = item

    # list all images in that directory
    img_list = os.listdir('Data_Images/val_dir/' + img_class)

    # Copy images from the class train dir to the img_dir e.g. class 'mel'
    for fname in img_list:
            # source path to image
            src = os.path.join('Data_Images/val_dir/' + img_class, fname)
            # destination path to image
            dst = os.path.join(img_dir, fname)
            # copy the image from the source to the destination
            shutil.copyfile(src, dst)


    # point to a dir containing the images and not to the images themselves
    path = aug_dir
    save_path = 'Data_Images/val_dir/' + img_class

    # Create a data generator
    datagen = ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        #brightness_range=(0.9,1.1),
        fill_mode='nearest')

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,
                                           save_to_dir=save_path,
                                           save_format='jpg',
                                                    target_size=(224,224),
                                                    batch_size=batch_size)



    # Generate the augmented images and add them to the training folders
    
    ###########
    
    num_aug_images_wanted = 75 # total number of images we want to have in each class
    
    ###########
    
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted-num_files)/batch_size))

    # run the generator and create about 6000 augmented images
    for i in range(0,num_batches):

        imgs, labels = next(aug_datagen)
        
    # delete temporary directory with the raw image files
    shutil.rmtree('aug_dir')

Found 39 images belonging to 1 classes.
Found 75 images belonging to 1 classes.
Found 30 images belonging to 1 classes.
Found 26 images belonging to 1 classes.
Found 11 images belonging to 1 classes.
Found 6 images belonging to 1 classes.


In [22]:
# check how many train images we have in each folder

print(len(os.listdir('Data_Images/train_dir/nv')))
print(len(os.listdir('Data_Images/train_dir/mel')))
print(len(os.listdir('Data_Images/train_dir/bkl')))
print(len(os.listdir('Data_Images/train_dir/bcc')))
print(len(os.listdir('Data_Images/train_dir/akiec')))
print(len(os.listdir('Data_Images/train_dir/vasc')))
print(len(os.listdir('Data_Images/train_dir/df')))

3664
723
715
725
625
421
495


In [23]:
# check how many val images we have in each folder

print(len(os.listdir('Data_Images/val_dir/nv')))
print(len(os.listdir('Data_Images/val_dir/mel')))
print(len(os.listdir('Data_Images/val_dir/bkl')))
print(len(os.listdir('Data_Images/val_dir/bcc')))
print(len(os.listdir('Data_Images/val_dir/akiec')))
print(len(os.listdir('Data_Images/val_dir/vasc')))
print(len(os.listdir('Data_Images/val_dir/df')))

751
78
75
60
52
33
18


In [24]:
#remove some images from nv randomly 
from random import sample
files = os.listdir('Data_Images/train_dir/nv')
for file in sample(files, len(files)-900):
    os.remove('Data_Images/train_dir/nv/'+file)
    
from random import sample
files = os.listdir('Data_Images/val_dir/nv')
for file in sample(files, len(files)-90):
    os.remove('Data_Images/val_dir/nv/'+file)

In [27]:
# check how many train images we have in each folder

print(len(os.listdir('Data_Images/train_dir/nv')))
print(len(os.listdir('Data_Images/train_dir/mel')))
print(len(os.listdir('Data_Images/train_dir/bkl')))
print(len(os.listdir('Data_Images/train_dir/bcc')))
print(len(os.listdir('Data_Images/train_dir/akiec')))
print(len(os.listdir('Data_Images/train_dir/vasc')))
print(len(os.listdir('Data_Images/train_dir/df')))

900
723
715
725
625
421
495


In [28]:
# check how many val images we have in each folder

print(len(os.listdir('Data_Images/val_dir/nv')))
print(len(os.listdir('Data_Images/val_dir/mel')))
print(len(os.listdir('Data_Images/val_dir/bkl')))
print(len(os.listdir('Data_Images/val_dir/bcc')))
print(len(os.listdir('Data_Images/val_dir/akiec')))
print(len(os.listdir('Data_Images/val_dir/vasc')))
print(len(os.listdir('Data_Images/val_dir/df')))

90
78
75
60
52
33
18


### Visualize 50 augmented images

In [None]:
# End of Data Preparation
### ===================================================================================== ###
# Start of Model Building

### Handling Imbalanced Classes

In [None]:
# count_classes = df['dx'].value_counts()
# count_classes.plot(kind='bar')

In [None]:
# df['dx'].value_counts()

**Under Sampling the Data**<br>
Using Imblearn Library

In [None]:
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(random_state=101)

# df_resampled, y_resampled = rus.fit_resample(df, df['dx'])
# df_resampled.head()  

In [None]:
# df_resampled['dx'].value_counts()

In [None]:
# # smote_tomek = SMOTETomek(random_state=101)
# # df_resampled, y_resampled = smote_tomek.fit_resample(df, df['dx'])

# smote_enn = SMOTEENN(random_state=0)
# df_resampled, y_resampled = smote_enn.fit_resample(df, df['dx'])
# df_resampled['dx'].value_counts()