In [1]:
# General
import pandas as pd
import os
import hashlib
from tqdm import tqdm
import shutil

# Modeling
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from keras.layers import Dropout

In [2]:
# Locations of the dataset on disk, relative to the notebook's working directory
path_to_images = os.path.join('data', 'Images')
path_to_metadata = os.path.join('data', 'fitzpatrick17k.csv')

# One row per image; later cells use the md5hash, label and fitzpatrick_scale columns
metadata = pd.read_csv(path_to_metadata)

In [7]:
# Rename every image in `path_to_images` to "<md5 of its bytes>.jpg" so that the
# file names can be matched against the `md5hash` column of the metadata.
for image in tqdm(os.listdir(path_to_images)):
    image_path = os.path.join(path_to_images, image)

    # os.listdir also returns sub-directories (the train/validation/test folders
    # created later in this notebook live here); open() on a directory raises,
    # so skip anything that is not a regular file to keep the cell re-runnable.
    if not os.path.isfile(image_path):
        continue

    with open(image_path, 'rb') as f:
        file_hash = hashlib.md5(f.read()).hexdigest()

    hashed_path = os.path.join(path_to_images, file_hash + '.jpg')
    # Nothing to do when the file already carries its hash as its name
    if image_path != hashed_path:
        os.rename(image_path, hashed_path)

100%|██████████| 16527/16527 [03:11<00:00, 86.10it/s]


#### Splitting the images into train, validation and test sets. Each set has to preserve the proportions of the classes as well as of the Fitzpatrick scale values.

Let's look at the distribution of the classes in the dataset.

In [3]:
# Number of images per fitzpatrick_scale value, most frequent value first
images_per_scale = metadata.groupby('fitzpatrick_scale').size()
images_per_scale.sort_values(ascending=False)

fitzpatrick_scale
 2    4808
 3    3308
 1    2947
 4    2781
 5    1533
 6     635
-1     565
dtype: int64

The `-1` value can be interpreted as the absence of a class (`NaN`).

In [4]:
# Number of images per diagnosis label, most frequent label first
images_per_label = metadata.groupby(['label']).size()
images_per_label.sort_values(ascending=False)

label
psoriasis                      653
squamous cell carcinoma        581
lichen planus                  491
basal cell carcinoma           468
allergic contact dermatitis    430
                              ... 
paronychia                      59
erythema elevatum diutinum      55
pustular psoriasis              53
pilomatricoma                   53
xanthomas                       53
Length: 114, dtype: int64

#### Now let's split the data into train, validation and test sets. We will use the following proportions:

- 70% train
- 15% validation
- 15% test

In [64]:
# Stratified 70/15/15 split: the rows are split separately inside every
# (fitzpatrick_scale, label) group so that each subset keeps the proportions of
# both the classes and the skin types. The result is written to the new
# `subset` column of `metadata` ('train' / 'validation' / 'test').

# Fixed seed: without it every run draws a different split, which would no
# longer match image files that were already moved into their subset folders.
SPLIT_SEED = 42

for _, group in metadata.groupby(['fitzpatrick_scale', 'label']):
    # 70% of the group goes to training
    train = group.sample(frac=0.7, random_state=SPLIT_SEED)['md5hash']

    # The remaining 30% is halved: 15% validation, 15% test
    remaining = group.drop(train.index)
    validation = remaining.sample(frac=0.5, random_state=SPLIT_SEED)['md5hash']
    test = remaining.drop(validation.index)['md5hash']

    # Assign by hash (not by row) so that all rows describing the same image
    # file always end up in the same subset.
    metadata.loc[metadata['md5hash'].isin(train), 'subset'] = 'train'
    metadata.loc[metadata['md5hash'].isin(validation), 'subset'] = 'validation'
    metadata.loc[metadata['md5hash'].isin(test), 'subset'] = 'test'

In [78]:
# Scratch check of the destination folder layout (<subset>/<label>).
# NOTE(review): this only works when `subset` is a (subset name, label) key tuple
# left over from the image-moving loop in the next cell; right after the split
# cell above, `subset` is a (key, DataFrame) pair and this call fails.
# NOTE(review): 'Data' differs in case from the 'data' used for path_to_images — confirm.
os.path.join('Data','Images', subset[0], subset[1])

'Data\\Images\\test\\acanthosis nigricans'

In [None]:
# Move every image into <path_to_images>/<subset>/<label>/, which is the
# one-folder-per-class layout that flow_from_directory reads further down.
for subset, images in metadata.groupby(['subset', 'label']):
    # `subset` is the group key, i.e. the (subset name, label name) pair;
    # `images` holds the metadata rows that belong to that pair.
    subset_name, label_name = subset

    target_dir = os.path.join(path_to_images, subset_name, label_name)
    os.makedirs(target_dir, exist_ok=True)

    for image in images['md5hash']:
        original_path = os.path.join(path_to_images, image + '.jpg')
        new_path = os.path.join(target_dir, image + '.jpg')
        try:
            shutil.move(original_path, new_path)
        except FileNotFoundError:
            # Only a missing source file means the image was never downloaded
            # (or was already moved). Any other error (permissions, full disk,
            # Ctrl-C) now propagates instead of being reported as a missing file.
            print('File not downloaded')

#### Now let's try to build a CNN model to classify the images.

In [11]:
# Building a CNN: a frozen, ImageNet-pretrained VGG16 convolutional base with a
# small, trainable, fully connected classification head on top.

# Shared data generator; applies the VGG16-specific input preprocessing
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)


def _flow_from_subset(subset_name):
    """Return a batch iterator over path_to_images/<subset_name>.

    The directory is expected to contain one sub-folder per class. Images are
    resized to 224x224 and served in batches of 32 with one-hot encoded labels.
    """
    return datagen.flow_from_directory(
        os.path.join(path_to_images, subset_name),
        target_size=(224, 224),
        batch_size=32,
        class_mode='categorical')


# Data generators for the training, validation and test data
train_generator = _flow_from_subset('train')
validation_generator = _flow_from_subset('validation')
test_generator = _flow_from_subset('test')

# Load the VGG16 network without its original classification layers
vgg = VGG16(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# Freeze the layers so that only the new head is trained
for layer in vgg.layers:
    layer.trainable = False

# Flatten the convolutional feature maps into a single vector per image
flatten = layers.Flatten()(vgg.output)

# Add a fully connected layer with 1024 neurons, followed by dropout
dense = layers.Dense(1024, activation='relu')(flatten)
dropout = layers.Dropout(0.5)(dense)

# Add a fully connected layer with 256 neurons, followed by dropout.
# Each layer must take the preceding dropout output as its input; feeding it
# `dense` instead leaves the Dropout layers disconnected from the model.
dense = layers.Dense(256, activation='relu')(dropout)
dropout = layers.Dropout(0.5)(dense)

# One output neuron per class (114 for this dataset); the count is taken from
# the training generator so it always matches the folders found on disk
output = layers.Dense(train_generator.num_classes, activation='softmax')(dropout)

# Create a model
model = models.Model(vgg.input, output)

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

Found 11562 images belonging to 114 classes.
Found 2421 images belonging to 114 classes.
Found 2543 images belonging to 114 classes.


In [12]:
# Print the layer-by-layer overview (layer type, output shape, parameter count) of the model built above
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [13]:
# Train the model for 5 epochs; each epoch makes one full pass over the
# training batches and then evaluates on every validation batch
fit_options = dict(
    validation_data=validation_generator,
    epochs=5,
    steps_per_epoch=len(train_generator),
    validation_steps=len(validation_generator),
)
model.fit(train_generator, **fit_options)

Epoch 1/5
  5/362 [..............................] - ETA: 17:58 - loss: 34.4806 - accuracy: 0.0063

KeyboardInterrupt: 