In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# create a base dir
base_dir ='base_dir'
os.mkdir(base_dir)

In [3]:
# creating a train dir inside base dir
train_dir = os.path.join(base_dir,'train_dir')
os.mkdir(train_dir)

In [4]:
# creating a val dir inside base_dir
val_dir = os.path.join(base_dir,'val_dir')
os.mkdir(val_dir)

In [5]:
# inside Train and val folders we create separate folders for each class
# first within train folder
nv = os.path.join(train_dir,'nv')
os.mkdir(nv)

In [6]:
mel = os.path.join(train_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(train_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(train_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(train_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(train_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(train_dir, 'df')
os.mkdir(df)

In [7]:
# create new folders inside val_dir
nv = os.path.join(val_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(val_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(val_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(val_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(val_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(val_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(val_dir, 'df')
os.mkdir(df)

In [8]:
# creating Train and val sets
df_data = pd.read_csv('HAM10000_metadata.csv')
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [9]:
df_data['dx'].unique()

array(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype=object)

In total 7 unique classes are present under dx

In [10]:
 # this will tell us how many images are associated with each lesion_id
df = df_data.groupby('lesion_id').count()

In [11]:
df.head()

Unnamed: 0_level_0,image_id,dx,dx_type,age,sex,localization
lesion_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HAM_0000000,2,2,2,2,2,2
HAM_0000001,1,1,1,1,1,1
HAM_0000002,3,3,3,3,3,3
HAM_0000003,1,1,1,1,1,1
HAM_0000004,1,1,1,1,1,1


In [12]:
# now we filter out lesion_id's that have only one image associated with it
df = df[df['image_id'] == 1]

df.reset_index(inplace=True)

df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000001,1,1,1,1,1,1
1,HAM_0000003,1,1,1,1,1,1
2,HAM_0000004,1,1,1,1,1,1
3,HAM_0000007,1,1,1,1,1,1
4,HAM_0000008,1,1,1,1,1,1


In [13]:
# here we identify lesion_id's that have duplicate images and those that have only
# one image.

def identify_duplicates(x):
    
    unique_list = list(df['lesion_id'])
    
    if x in unique_list:
        return 'no_duplicates'
    else:
        return 'has_duplicates'
    
# create a new colum that is a copy of the lesion_id column
df_data['duplicates'] = df_data['lesion_id']
# apply the function to this new column
df_data['duplicates'] = df_data['duplicates'].apply(identify_duplicates)

df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicates
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,has_duplicates
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,has_duplicates
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,has_duplicates
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,has_duplicates
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,has_duplicates


In [14]:
df_data['duplicates'].value_counts()

no_duplicates     5514
has_duplicates    4501
Name: duplicates, dtype: int64

In [15]:
# now we filter out images that don't have duplicates
df = df_data[df_data['duplicates'] == 'no_duplicates']

df.shape

(5514, 8)

In [16]:
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,duplicates
10,HAM_0001396,ISIC_0025276,bkl,histo,55.0,female,trunk,no_duplicates
15,HAM_0007207,ISIC_0031326,bkl,histo,65.0,male,back,no_duplicates
20,HAM_0006071,ISIC_0032343,bkl,histo,70.0,female,face,no_duplicates
33,HAM_0005612,ISIC_0024981,bkl,histo,80.0,male,scalp,no_duplicates
34,HAM_0005388,ISIC_0027815,bkl,histo,80.0,male,chest,no_duplicates


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# now we create a val set using df because we are sure that none of these images
# have augmented duplicates in the train set
y = df['dx']

_, df_val = train_test_split(df, test_size=0.3, random_state=101, stratify=y)

df_val.shape

(1655, 8)

In [19]:
df_val['dx'].value_counts()


nv       1325
bkl       132
mel        69
bcc        53
akiec      45
vasc       19
df         12
Name: dx, dtype: int64

Create a train set that excludes images that are in the val set

In [20]:
# This function identifies if an image is part of the train
# or val set.
def identify_val_rows(x):
    # create a list of all the lesion_id's in the val set
    val_list = list(df_val['image_id'])
    
    if str(x) in val_list:
        return 'val'
    else:
        return 'train'

# identify train and val rows

# create a new colum that is a copy of the image_id column
df_data['train_or_val'] = df_data['image_id']
# apply the function to this new column
df_data['train_or_val'] = df_data['train_or_val'].apply(identify_val_rows)
   
# filter out train rows
df_train = df_data[df_data['train_or_val'] == 'train']


print(len(df_train))
print(len(df_val))

8360
1655


In [21]:
df_train['dx'].value_counts()

nv       5380
mel      1044
bkl       967
bcc       461
akiec     282
vasc      123
df        103
Name: dx, dtype: int64

In [22]:
df_val['dx'].value_counts()

nv       1325
bkl       132
mel        69
bcc        53
akiec      45
vasc       19
df         12
Name: dx, dtype: int64

Transfer the Images into the Folders

In [23]:
# Set the image_id as the index in df_data
df_data.set_index('image_id', inplace=True)

In [24]:
df_data.head()

Unnamed: 0_level_0,lesion_id,dx,dx_type,age,sex,localization,duplicates,train_or_val
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ISIC_0027419,HAM_0000118,bkl,histo,80.0,male,scalp,has_duplicates,train
ISIC_0025030,HAM_0000118,bkl,histo,80.0,male,scalp,has_duplicates,train
ISIC_0026769,HAM_0002730,bkl,histo,80.0,male,scalp,has_duplicates,train
ISIC_0025661,HAM_0002730,bkl,histo,80.0,male,scalp,has_duplicates,train
ISIC_0031633,HAM_0001466,bkl,histo,75.0,male,ear,has_duplicates,train


In [25]:
# Get a list of images in each of the two folders
folder_1 = os.listdir('ham10000_images_part_1')
folder_2 = os.listdir('ham10000_images_part_2')

In [26]:
# Get a list of train and val images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

In [27]:
train_list

['ISIC_0027419',
 'ISIC_0025030',
 'ISIC_0026769',
 'ISIC_0025661',
 'ISIC_0031633',
 'ISIC_0027850',
 'ISIC_0029176',
 'ISIC_0029068',
 'ISIC_0025837',
 'ISIC_0025209',
 'ISIC_0025276',
 'ISIC_0029396',
 'ISIC_0025984',
 'ISIC_0025767',
 'ISIC_0032417',
 'ISIC_0031326',
 'ISIC_0025915',
 'ISIC_0031029',
 'ISIC_0029836',
 'ISIC_0032129',
 'ISIC_0032343',
 'ISIC_0025033',
 'ISIC_0027310',
 'ISIC_0032128',
 'ISIC_0025937',
 'ISIC_0027828',
 'ISIC_0029291',
 'ISIC_0030698',
 'ISIC_0025567',
 'ISIC_0031753',
 'ISIC_0026835',
 'ISIC_0031159',
 'ISIC_0031017',
 'ISIC_0024981',
 'ISIC_0027815',
 'ISIC_0024324',
 'ISIC_0029559',
 'ISIC_0030661',
 'ISIC_0027053',
 'ISIC_0028560',
 'ISIC_0031650',
 'ISIC_0029161',
 'ISIC_0026273',
 'ISIC_0025076',
 'ISIC_0029687',
 'ISIC_0025642',
 'ISIC_0025819',
 'ISIC_0032013',
 'ISIC_0031691',
 'ISIC_0025419',
 'ISIC_0030105',
 'ISIC_0025016',
 'ISIC_0029147',
 'ISIC_0029301',
 'ISIC_0026637',
 'ISIC_0030377',
 'ISIC_0027960',
 'ISIC_0026955',
 'ISIC_0028052

In [28]:
label = df_data.loc['ISIC_0024324','dx']

In [29]:
label

'bkl'

In [32]:
import shutil, sys                                                                                                                                                    
from shutil import copyfile
from shutil import copy

In [33]:
# Transfer the train images

for image in train_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

In [34]:
# Transfer the val images

for image in val_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

In [35]:
# check how many train images we have in each folder

print(len(os.listdir('base_dir/train_dir/nv')))
print(len(os.listdir('base_dir/train_dir/mel')))
print(len(os.listdir('base_dir/train_dir/bkl')))
print(len(os.listdir('base_dir/train_dir/bcc')))
print(len(os.listdir('base_dir/train_dir/akiec')))
print(len(os.listdir('base_dir/train_dir/vasc')))
print(len(os.listdir('base_dir/train_dir/df')))

5380
1044
967
461
282
123
103


In [36]:
# check how many val images we have in each folder

print(len(os.listdir('base_dir/val_dir/nv')))
print(len(os.listdir('base_dir/val_dir/mel')))
print(len(os.listdir('base_dir/val_dir/bkl')))
print(len(os.listdir('base_dir/val_dir/bcc')))
print(len(os.listdir('base_dir/val_dir/akiec')))
print(len(os.listdir('base_dir/val_dir/vasc')))
print(len(os.listdir('base_dir/val_dir/df')))

1325
69
132
53
45
19
12


In [43]:
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img

In [46]:
# note that we are not augmenting class 'nv'
class_list = ['mel','bkl','bcc','akiec','vasc','df']

for item in class_list:
    
    # We are creating temporary directories here because we delete these directories later
    # create a base dir
    aug_dir = 'aug_dir'
    os.mkdir(aug_dir)
    # create a dir within the base dir to store images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    # Choose a class
    img_class = item

    # list all images in that directory
    img_list = os.listdir('base_dir/train_dir/' + img_class)

    # Copy images from the class train dir to the img_dir e.g. class 'mel'
    for fname in img_list:
            # source path to image
            src = os.path.join('base_dir/train_dir/' + img_class, fname)
            # destination path to image
            dst = os.path.join(img_dir, fname)
            # copy the image from the source to the destination
            shutil.copyfile(src, dst)


    # point to a dir containing the images and not to the images themselves
    path = aug_dir
    save_path = 'base_dir/train_dir/' + img_class

    # Create a data generator
    datagen = ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        #brightness_range=(0.9,1.1),
        fill_mode='nearest')

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,
                                           save_to_dir=save_path,
                                           save_format='jpg',
                                                    target_size=(224,224),
                                                    batch_size=batch_size)



    # Generate the augmented images and add them to the training folders
    
    ###########
    
    num_aug_images_wanted = 6000 # total number of images we want to have in each class
    
    ###########
    
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted-num_files)/batch_size))

    # run the generator and create about 6000 augmented images
    for i in range(0,num_batches):

        imgs, labels = next(aug_datagen)
        
    # delete temporary directory with the raw image files
    shutil.rmtree('aug_dir')

Found 6020 images belonging to 1 classes.
Found 967 images belonging to 1 classes.
Found 461 images belonging to 1 classes.
Found 282 images belonging to 1 classes.
Found 123 images belonging to 1 classes.
Found 103 images belonging to 1 classes.


In [47]:
# Check how many train images we now have in each folder.
# This is the original images plus the augmented images.

print(len(os.listdir('base_dir/train_dir/nv')))
print(len(os.listdir('base_dir/train_dir/mel')))
print(len(os.listdir('base_dir/train_dir/bkl')))
print(len(os.listdir('base_dir/train_dir/bcc')))
print(len(os.listdir('base_dir/train_dir/akiec')))
print(len(os.listdir('base_dir/train_dir/vasc')))
print(len(os.listdir('base_dir/train_dir/df')))

5380
6020
5852
5582
5690
4970
4170
