<h1><center><strong>CNN Model for Image Classification of Skin Lesions</strong> <br> (Melanoma, Nevus, Seborrheic-Keratosis)</center></h1>

## **Importing Libraries**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, ZeroPadding2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import SGD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import random
import matplotlib.pyplot as plt
import itertools
import os
import pandas as pd

## **Data Path Setup**

In [None]:
# Root folders of the three dataset splits. The trailing slash is kept because
# sub-folder paths are built from these by plain string concatenation.
train_path, valid_path, test_path = (
    'data/skin-lesions/train/',
    'data/skin-lesions/valid/',
    'data/skin-lesions/test/',
)

## **Data Setup & Preparation**

In [None]:
# Class sub-folder names; their position in this list fixes the label index (0, 1, 2).
lesion_classes = ['melanoma', 'nevus', 'seborrheic_keratosis']


def _vgg19_generator():
    """Return a fresh ImageDataGenerator that applies VGG19 input preprocessing."""
    return ImageDataGenerator(preprocessing_function=tf.keras.applications.vgg19.preprocess_input)


train_batches = _vgg19_generator().flow_from_directory(
    directory=train_path,
    target_size=(224, 224),
    classes=list(lesion_classes),
    batch_size=10,
)

valid_batches = _vgg19_generator().flow_from_directory(
    directory=valid_path,
    target_size=(224, 224),
    classes=list(lesion_classes),
    batch_size=10,
)

# shuffle=False keeps the test labels in a fixed order so that predictions can
# later be compared position-by-position with the true classes.
test_batches = _vgg19_generator().flow_from_directory(
    directory=test_path,
    target_size=(224, 224),
    classes=list(lesion_classes),
    batch_size=10,
    shuffle=False,
)

## **Class Imbalance**

In [None]:
def _count_files(directory):
    """Count the regular files directly inside `directory`, ignoring sub-folders."""
    return sum(
        1
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )


# Count only regular files: os.listdir() also returns sub-directories (the
# augmentation cells write into an 'aug/' sub-folder of two of these classes),
# which would inflate a plain len(os.listdir(...)) count.
num_melanoma = _count_files(train_path + 'melanoma')
num_nevus = _count_files(train_path + 'nevus')
num_seborrheic_keratosis = _count_files(train_path + 'seborrheic_keratosis')

print('Number of melanoma images: ', num_melanoma)
print('Number of nevus images: ', num_nevus)
print('Number of seborrheic keratosis images: ', num_seborrheic_keratosis)

## **Data Augmentation**

In [None]:
num_gen_melanoma = 100 # number of images to generate for melanoma class in order to balance the dataset

def dataAugMelanoma():
    """Generate `num_gen_melanoma` augmented melanoma images and describe them.

    Each iteration picks a random original image from the melanoma training
    folder, applies one random transformation and saves the result as a JPEG
    in the folder's 'aug/' sub-directory (created if it does not exist).

    Returns:
        pandas.DataFrame with a 'filepath' column holding the paths of the
        images written by this call and a constant 'label' column of '0'.

    Raises:
        ValueError: if the melanoma folder contains no files to augment.
    """
    gen_mel = ImageDataGenerator(
        rotation_range=10,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='nearest',
        shear_range=0.15,
        zoom_range=0.1,
        channel_shift_range=10.0,
    )

    folder_path = 'data/skin-lesions/train/melanoma/'
    aug_folder = folder_path + 'aug/'
    os.makedirs(aug_folder, exist_ok=True)

    # Only regular files are candidates: os.listdir() also returns the 'aug'
    # sub-directory, which cannot be read as an image. Sorting makes the
    # random picks reproducible for a given random seed.
    source_images = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isfile(folder_path + name)
    )
    if not source_images:
        raise ValueError('No images found in ' + folder_path)

    aug_paths = []
    for i in range(num_gen_melanoma):
        image_path = folder_path + random.choice(source_images)
        image = np.expand_dims(plt.imread(image_path), 0)
        # flow() yields batches; the batch holds the single augmented image.
        aug_image = next(gen_mel.flow(image))[0].astype(np.uint8)
        # NOTE(review): the 816 offset is kept from the original numbering
        # scheme; its meaning is not visible here - confirm before changing.
        aug_path = aug_folder + 'aug_' + str(i + 816) + '.jpg'
        plt.imsave(aug_path, aug_image)
        aug_paths.append(aug_path)

    print('Number of melanoma images after augmentation: ', len(source_images) + len(aug_paths))

    # Return the generated images (not the originals, which the caller already
    # has) so that they actually take part in training.
    return pd.DataFrame({'filepath': aug_paths, 'label': '0'})

In [None]:
num_gen_seb = 112 # number of images to generate for seborrheic keratosis class in order to balance the dataset

def dataAugSeb():
    """Generate `num_gen_seb` augmented seborrheic-keratosis images and describe them.

    Each iteration picks a random original image from the seborrheic-keratosis
    training folder, applies one random transformation and saves the result as
    a JPEG in the folder's 'aug/' sub-directory (created if it does not exist).

    Returns:
        pandas.DataFrame with a 'filepath' column holding the paths of the
        images written by this call and a constant 'label' column of '2'.

    Raises:
        ValueError: if the seborrheic-keratosis folder contains no files to augment.
    """
    gen_seb = ImageDataGenerator(
        rotation_range=10,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='nearest',
        shear_range=0.15,
        zoom_range=0.1,
        channel_shift_range=10.0,
    )

    folder_path = 'data/skin-lesions/train/seborrheic_keratosis/'
    aug_folder = folder_path + 'aug/'
    os.makedirs(aug_folder, exist_ok=True)

    # Only regular files are candidates: os.listdir() also returns the 'aug'
    # sub-directory, which cannot be read as an image. Sorting makes the
    # random picks reproducible for a given random seed.
    source_images = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isfile(folder_path + name)
    )
    if not source_images:
        raise ValueError('No images found in ' + folder_path)

    aug_paths = []
    for i in range(num_gen_seb):
        image_path = folder_path + random.choice(source_images)
        image = np.expand_dims(plt.imread(image_path), 0)
        # flow() yields batches; the batch holds the single augmented image.
        aug_image = next(gen_seb.flow(image))[0].astype(np.uint8)
        # NOTE(review): the 644 offset is kept from the original numbering
        # scheme; its meaning is not visible here - confirm before changing.
        aug_path = aug_folder + 'aug_' + str(i + 644) + '.jpg'
        plt.imsave(aug_path, aug_image)
        aug_paths.append(aug_path)

    print('Number of seborrheic keratosis images after augmentation: ', len(source_images) + len(aug_paths))

    # Return the generated images (not the originals, which the caller already
    # has) so that they actually take part in training.
    return pd.DataFrame({'filepath': aug_paths, 'label': '2'})

In [None]:
# Draw up to `samples_per_class` random training images for every class and
# remember which class each image belongs to, so the grid below can place an
# image in its class column even when a class has fewer images than rows.
samples_per_class = 10
class_names = list(train_batches.class_indices.keys())

random_images = []
images_by_class = {}
for class_name in class_names:
    class_index = train_batches.class_indices[class_name]
    class_indices = np.where(train_batches.classes == class_index)[0]
    selected_indices = random.sample(list(class_indices), min(samples_per_class, len(class_indices)))
    images_by_class[class_name] = [train_batches.filepaths[i] for i in selected_indices]
    random_images.extend(images_by_class[class_name])


print()
print('Random Images from Training Set:')
print()

# Plot the images: one column per class, one row per sampled image.
fig, axes = plt.subplots(nrows=samples_per_class, ncols=len(class_names), figsize=(10, 30), squeeze=False)
for col, class_name in enumerate(class_names):
    class_images = images_by_class[class_name]
    for row in range(samples_per_class):
        ax = axes[row, col]
        ax.axis('off')
        if row >= len(class_images):
            # Fewer samples than rows for this class: leave the cell blank.
            continue
        ax.imshow(plt.imread(class_images[row]))
        # Title with the class name on every platform (the path-splitting
        # expression used before gave the file name on POSIX systems).
        ax.set_title(class_name)

plt.tight_layout()
plt.show()


## **Building a Sequential CNN Model**

In [None]:
# Convolutional base: (filters, number of stacked 3x3 'same'-padded conv layers)
# for each of the five blocks; every block ends with 2x2 max pooling.
conv_blocks = [(64, 2), (128, 3), (256, 3), (512, 3), (512, 3)]

cnn_layers = []
for block_filters, block_depth in conv_blocks:
    for _ in range(block_depth):
        conv_kwargs = dict(filters=block_filters, kernel_size=(3, 3), activation='relu', padding='same')
        if not cnn_layers:
            # Only the very first layer declares the input image shape.
            conv_kwargs['input_shape'] = (224, 224, 3)
        cnn_layers.append(Conv2D(**conv_kwargs))
    cnn_layers.append(MaxPooling2D(pool_size=(2, 2)))

cnn_layers.append(Flatten())

# Fully connected head: four 256-unit layers, each followed by 30% dropout.
for _ in range(4):
    cnn_layers.append(Dense(256, activation='relu'))
    cnn_layers.append(Dropout(0.3))

# Progressively narrower L2-regularised layers before the classifier.
for dense_units in (16, 8, 4):
    cnn_layers.append(Dense(dense_units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)))

# Three-way softmax output.
cnn_layers.append(Dense(units=3, activation='softmax'))

model = Sequential(cnn_layers)

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()

In [None]:
# Adam with a small learning rate; categorical cross-entropy for the 3-way
# softmax output, with accuracy reported alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)

## **k-Fold Cross-Validation**

In [None]:
k = 10 # number of folds

# NOTE(review): `skf` is created but never used to split the data, so the loop
# below is not a true k-fold cross-validation: every round keeps training the
# same model instance (weights carry over between rounds) on the full training
# set plus freshly generated augmented images, and scores it on the same test
# set. The per-round scores are therefore not independent estimates.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

acc_per_fold = []
loss_per_fold = []

separator = '------------------------------------------------------------------------'

# Folders that receive the temporary augmented images of each round.
aug_folders = [
    'data/skin-lesions/train/melanoma/aug/',
    'data/skin-lesions/train/seborrheic_keratosis/aug/',
]

# Label values as strings, in class-index order; passed explicitly so the
# label -> index mapping matches the directory-based generators.
class_labels = [str(index) for index in sorted(train_batches.class_indices.values())]


def _clear_folder(folder_path):
    """Delete every regular file directly inside `folder_path` (no-op if it is missing)."""
    if not os.path.isdir(folder_path):
        return
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        if os.path.isfile(file_path):
            os.remove(file_path)


x_test = test_batches

for fold_no in range(1, k + 1):
    print(separator)
    print(f'Training for fold {fold_no} ...')

    filepaths = train_batches.filepaths
    # Cast the integer class indices to strings: flow_from_dataframe in its
    # default categorical mode requires string labels, and the augmentation
    # helpers already label their rows with '0' / '2'.
    labels = np.asarray(train_batches.labels).astype(str)

    df1 = pd.DataFrame({"filepath": filepaths, "label": labels})

    try:
        # data augmentation
        df2 = dataAugMelanoma()
        df3 = dataAugSeb()

        df = pd.concat([df1, df2, df3], ignore_index=True)
        df["label"] = df["label"].astype(str)

        x_train = ImageDataGenerator(preprocessing_function=tf.keras.applications.vgg19.preprocess_input).flow_from_dataframe(
            dataframe=df,
            x_col="filepath",
            y_col="label",
            classes=class_labels,
            target_size=(224,224),
            batch_size=10,
        )

        # Fit data to model. `batch_size` is not passed here: the generator
        # already yields batches of 10 and Keras does not accept/ignores the
        # argument for generator inputs.
        history = model.fit(x_train, validation_data=valid_batches, epochs=15, verbose=1)

        scores = model.evaluate(x_test, verbose=0)
        print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
        loss_per_fold.append(scores[0])
        acc_per_fold.append(scores[1] * 100)
    finally:
        # Always remove this round's augmented files, even when training
        # fails, so they do not leak into the next round or the next run.
        for aug_folder in aug_folders:
            _clear_folder(aug_folder)

print(separator)
print('Score per fold')
for i in range(len(acc_per_fold)):
    print(separator)
    # Report loss and accuracy under their own labels (the accuracy value used
    # to be printed under the label "Loss").
    print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')

print(separator)

print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

## **Loss vs. Epoch Plot**

In [None]:
# Training and validation loss per epoch, read from the history object
# attached to the model.
loss_history = model.history.history
for history_key, curve_label in (('loss', 'train loss'), ('val_loss', 'val loss')):
    plt.plot(loss_history[history_key], label=curve_label)
plt.legend()
plt.show()

## **Saving the Model in HDF5 Format**

In [None]:
# Persist the trained model once; an existing file is never overwritten.
# (`os` is already imported in the notebook's import cell.)
model_file = 'models/model_8.h5'

if not os.path.isfile(model_file):
    # Create the target folder first; saving fails when 'models/' is missing.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    model.save(model_file)

# **Loading Saved Model**

In [None]:
# NOTE(review): this loads 'models/model_11.h5', while the save cell writes
# 'models/model_8.h5' - presumably a previously trained checkpoint is wanted
# here; confirm, because this replaces the `model` trained earlier in the
# notebook and fails on a fresh checkout if the file does not exist.
model = load_model('models/model_11.h5')

In [None]:
# Print the layer-by-layer summary of the model that was just loaded.
model.summary()

# **Predict Results**

In [None]:
# Run the loaded model over the test generator.
# Assumes the result has one row per test image and one score per class
# (n_samples x 3) - TODO confirm for the loaded checkpoint.
predictions = model.predict(x=test_batches, verbose=1)

In [None]:
# Display the raw model output.
predictions

In [None]:
# Collapse the per-class scores to the index of the highest-scoring class.
# The ndim guard makes this cell safe to re-run: applying argmax to an
# already-collapsed 1-D vector would reduce it to a single scalar.
predictions = np.asarray(predictions)
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=-1)

In [None]:
# Display the predicted class indices.
predictions

In [None]:
# Write the current predictions to disk as CSV (pandas also writes the row
# index). pandas is already available as `pd` from the notebook's import cell.
pred = pd.DataFrame(data=predictions)
pred.to_csv(path_or_buf='predictions_11.csv')

In [None]:
# `predictions` was already reduced to class indices in an earlier cell.
# A second unconditional argmax would collapse that 1-D vector into a single
# scalar and break the accuracy and confusion-matrix cells, so only reduce
# when per-class scores are still present.
predictions = np.asarray(predictions)
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=-1)

predictions

In [None]:
# True class indices of the test images. The test generator was created with
# shuffle=False, so this order is meant to line up with the prediction order.
actuals = test_batches.classes

actuals

In [None]:
# Fraction of test images whose predicted class equals the true class.
is_correct = predictions == actuals
accuracy = np.mean(is_correct)

accuracy

# **Confusion Matrix**

In [None]:
# Confusion matrix of the true test classes against the predicted classes.
cm = confusion_matrix(y_true=actuals, y_pred=predictions)

In [None]:
def plot_confusion_matrix(cm, classes, normalizer=False, title="Confusion Matrix", cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array-like of counts; rows are labelled as the true class and
            columns as the predicted class.
        classes: sequence of tick labels, one per class, in index order.
        normalizer: when True, every row is divided by its sum so each cell
            shows a fraction of that row; rows that sum to zero stay at 0.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.

    Returns:
        None. The plot is drawn with pyplot on the current figure.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing, so the colours and the printed numbers
    # describe the same values.
    if normalizer:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,  # avoid NaN for classes without samples
        )

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontdict={'size':14, 'weight':'bold'})
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text for contrast.
    thresh = cm.max() / 2. if cm.size else 0
    # Two decimals for fractional values, plain integers for counts.
    cell_format = '.2f' if np.issubdtype(cm.dtype, np.floating) else 'd'

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    # Axis labels and layout are set once, after all cells are annotated.
    plt.ylabel('True Label', fontdict={'size':14, 'weight':'bold'})
    plt.xlabel('Predicted Label', fontdict={'size':14, 'weight':'bold'})
    plt.tight_layout()

In [None]:
# Tick labels in the same order as the `classes` list given to the data
# generators, so label i corresponds to class index i.
cm_plot_labels = ['melanoma', 'nevus', 'seborrheic_keratosis']
plot_confusion_matrix(cm=cm, classes=cm_plot_labels, title='Confusion Matrix')