
```
@data{DVN/DBW86T_2018,
author = {Tschandl, Philipp},
publisher = {Harvard Dataverse},
title = {{The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions}},
UNF = {UNF:6:KCZFcBLiFE5ObWcTc2ZBOA==},
year = {2018},
version = {V4},
doi = {10.7910/DVN/DBW86T},
url = {https://doi.org/10.7910/DVN/DBW86T}
}
```



The HAM10000 dataset served as the training set for the ISIC 2018 challenge (Task 3), with the same sources contributing the majority of the validation and test sets as well. The test-set images are available herein as ISIC2018_Task3_Test_Images.zip (1511 images); the ground truth, in the same format as the HAM10000 data (public since 2023), is available as ISIC2018_Task3_Test_GroundTruth.csv. The ISIC Archive also provides the challenge images and metadata (training, validation, test) on its "ISIC Challenge Datasets" page.

https://challenge.isic-archive.com/data/#2018

In [None]:
!pip install opendatasets
import opendatasets as od
import pandas as pd


[0m

In [None]:
# Kaggle credentials are required: upload kaggle.json next to the notebook and
# opendatasets picks it up when fetching the dataset.
DATASET_URL = "https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000"
od.download(DATASET_URL, force=True)

# Root folder of the downloaded dataset. Later cells build file names by
# string concatenation, so the trailing slash matters.
path = "skin-cancer-mnist-ham10000/"

Downloading skin-cancer-mnist-ham10000.zip to ./skin-cancer-mnist-ham10000


100%|██████████| 5.20G/5.20G [00:27<00:00, 204MB/s] 





In [None]:
# Read the per-image metadata table and preview its first rows.
metadata_csv = path + 'HAM10000_metadata.csv'
meta = pd.read_csv(metadata_csv)
meta.head()

In [None]:
# Diagnosis codes (values of the `dx` column) and their descriptions from ISIC.
# The insertion order of this dict defines the integer class index (0-6) that
# every other lookup below is derived from.
lesion_types = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis-like lesions ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

# Short codes and long names, both ordered by class index.
lesion_names_short = list(lesion_types)
lesion_names = [lesion_types[code] for code in lesion_names_short]

# Short code -> integer class index.
lesion_type_short = {code: index for index, code in enumerate(lesion_names_short)}

# Attach the readable name and the integer label to every metadata row.
meta['lesion_type'] = meta['dx'].map(lesion_types)
meta['lesion_type_short'] = meta['dx'].map(lesion_type_short)

print('Total number of images %i' % len(meta))

meta['lesion_type'].value_counts()

## Data Augmentation

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from cv2 import imread, resize

# Use the second metadata entry as the sample and look it up in part 1.
images = np.array(meta['image_id'])
HAM_part1 = f"{path}HAM10000_images_part_1/{images[1]}.jpg"

img = imread(HAM_part1)
resized_img = resize(img, (100, 100))

# Show the original next to its 100x100 version. The channel axis is reversed
# because OpenCV delivers BGR while matplotlib expects RGB.
fig, (ax_original, ax_resized) = plt.subplots(1, 2, figsize=(10, 5))
ax_original.imshow(img[:, :, ::-1])
ax_original.set_title('Original image')
ax_resized.imshow(resized_img[:, :, ::-1])
ax_resized.set_title('Resized image')
plt.show()

In [None]:
# augmenting images to reduce class imbalance
def augment(image, number:int = 5):
    """Create up to five rotated / flipped copies of ``image``.

    New images are produced by rotating or flipping the original one. This
    enlarges the under-represented classes of the dataset.

    The variants are always generated in this order:
    90 degrees clockwise, 90 degrees counter-clockwise, 180 degrees,
    flip around the horizontal axis (``cv2.flip(image, 0)``),
    flip around the vertical axis (``cv2.flip(image, 1)``).

    Parameters
    ----------
    image : array accepted by ``cv2.rotate`` / ``cv2.flip``
        The source image; it is not modified.
    number : int, default 5
        How many augmented versions to produce (1 to 5).

    Returns
    -------
    A single image when ``number == 1``, a tuple with the first ``number``
    variants when ``number`` is 2-5, and ``None`` for any other value.
    """
    if number not in (1, 2, 3, 4, 5):
        return None

    # One entry per variant. The original repeated these calls in five
    # separate branches, and the ``number == 4`` branch flipped an undefined
    # ``img2`` instead of ``image`` (NameError).
    operations = (
        lambda im: cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE),
        lambda im: cv2.rotate(im, cv2.ROTATE_90_COUNTERCLOCKWISE),
        lambda im: cv2.rotate(im, cv2.ROTATE_180),
        lambda im: cv2.flip(im, 0),
        lambda im: cv2.flip(im, 1),
    )

    count = int(number)
    variants = tuple(operation(image) for operation in operations[:count])

    # Backward compatibility: a single variant is returned bare, not as a tuple.
    return variants[0] if count == 1 else variants

# Preview: the resized sample followed by its five augmented variants.
new_img = augment(resized_img)

fig, axes = plt.subplots(2, 3, figsize=(10, 8))
panels = [resized_img, *new_img]
for axis, panel in zip(axes.flat, panels):
    # reverse the channel axis (BGR -> RGB) for display
    axis.imshow(panel[:, :, ::-1])
fig.tight_layout()
plt.show()

In [None]:
import os
# function for importing images from the two folders

def image_importer(path, append=True, X:list=None, y:list=None):
    """Load all images of one folder as 100x100 arrays together with their labels.

    Every file found in ``path`` is read with ``imread``, resized to 100x100
    and appended to ``X``; its integer class is looked up in the global
    ``meta`` table (column ``lesion_type_short``, matched on ``image_id``) and
    appended to ``y``. Images whose class is not 0 additionally contribute the
    variants returned by ``augment`` (five by default), each with the same label.

    Parameters
    ----------
    path : str
        Folder containing the ``.jpg`` files (with or without trailing slash).
    append : bool, default True
        If true, extend the ``X`` / ``y`` lists passed in; otherwise start
        from fresh, empty lists and ignore ``X`` / ``y``.
    X, y : list, optional
        Existing containers to extend when ``append`` is true. When omitted,
        new lists are created on every call.

    Returns
    -------
    (X, y) : tuple of lists
        Images and their integer labels, index-aligned.

    Raises
    ------
    IndexError
        If a file name has no matching ``image_id`` row in ``meta``.
    """
    # Fresh containers per call; mutable default arguments would be shared
    # between calls and silently accumulate images.
    if not append or X is None:
        X = []
    if not append or y is None:
        y = []

    dir_list = os.listdir(path)
    n_files = len(dir_list)

    for file_index, fname_image in enumerate(dir_list):
        fname_id = fname_image.replace('.jpg', '')

        # features|predictors
        img = imread(os.path.join(path, str(fname_image)))
        resized_img = resize(img, (100, 100))
        X.append(resized_img)

        # targets|predicted
        labels = np.array(meta[meta['image_id'] == fname_id].lesion_type_short)
        label = labels[0]
        y.append(label)

        # add more images for classes 1-6, by rotation and flipping
        if label != 0:
            for variant in augment(resized_img):
                X.append(variant)
                y.append(label)

        # print progress every 100 files; the inner loop no longer reuses the
        # outer loop variable, so the counter is the real file index
        if not append and file_index % 100 == 0:
            print("{0:-<7d}".format(file_index), "images loaded")  # d stands for decimal
        elif append and (n_files + file_index) % 100 == 0:
            # in append mode the counter is offset by this folder's file count
            print("{0:-<7d}".format(n_files + file_index), "images loaded")

    return X, y

In [None]:
# The dataset ships its images in two folders.
part1, part2 = (
    path + folder
    for folder in ('HAM10000_images_part_1/', 'HAM10000_images_part_2/')
)

# Start fresh containers with the first folder ...
X, y = image_importer(part1, append=False)
# ... then extend them with the images of the second folder.
X, y = image_importer(part2, X=X, y=y, append=True)

In [None]:
# after augmentation the data should comprise 26_565 images:
# 6_705 un-augmented nevi + 6 * 3_310 images of the other six classes
# (each original plus its five augmented variants)
# X and y are still lists
print(len(X) , len(y))

\begin{array}{|l|c|c|c|} \hline
\text{Lesion Type} & \text{Observations} & \text{Augmentation} & \text{Total} \\ \hline
%\text{Total number of images} & 10015 & & \\
\text{Melanocytic nevi} & 6705 & \text{None} & 6705  \\
\text{Melanoma} & 1113 & \times 6 & 6678 \\
\text{Benign keratosis-like lesions} & 1099 & \times 6 & 6594 \\
\text{Basal cell carcinoma} & 514 & \times 6 & 3084 \\
\text{Actinic keratoses} & 327 & \times 6 & 1962 \\
\text{Vascular lesions} & 142 & \times 6 & 852 \\
\text{Dermatofibroma} & 115 & \times 6 & 690 \\ \hline
\text{Σ} & 10015  &  & 26565 \\ \hline
\end{array}

In [None]:
# `keras.utils.np_utils` is a private module path that newer Keras releases no
# longer provide; `keras.utils.to_categorical` is the public location.
from keras.utils import to_categorical

# converting the Python lists of images and integer labels to NumPy arrays
X = np.array(X)
y = np.array(y)

# convert the 7 integer classes into one-hot (categorical) encodings
y_train = to_categorical(y, num_classes=7)


In [None]:
# matrix dimensions: X stacks the 100x100 images along the first axis,
# y holds one integer class label per image
print(X.shape)
print(y.shape)

X has 4 dimensions:

+ Dimension 1: It has a size of 26565, indicating that X contains 26565 elements along this dimension. This could represent the number of samples or data points in your dataset.

+ Dimension 2: It has a size of 100, meaning that each sample in X has a length or height of 100 units.

+ Dimension 3: It also has a size of 100, indicating that each sample in X has a width of 100 units.

+ Dimension 4: It has a size of 3, meaning that each image has three colour channels. Because the images were read with OpenCV's `imread`, the channels are stored in blue, green, red (BGR) order rather than the RGB order that matplotlib expects.

+ So, in summary, X is a 4-dimensional array where the first dimension represents the number of samples, and the last three dimensions represent the height, width, and number of channels (3 colour channels, stored in BGR order) of each sample image, respectively.

In [None]:
from sklearn.model_selection import train_test_split

# One-hot encode from the integer labels `y` here instead of reading the
# `y_train` variable that this cell overwrites. Otherwise re-running the cell
# would feed an already-split `y_train` back into the split.
y_onehot = to_categorical(y, num_classes=7)

# split into 67% training and 33% test data, stratified by the integer class label
X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.33, random_state=50, stratify=y)


print('Train dataset shape', X_train.shape)
print('Test dataset shape', X_test.shape)

In [None]:
# Show the first seven training images with their true class names.
fig, ax = plt.subplots(1, 7, figsize=(30, 30))
for i in range(7):
    ax[i].set_axis_off()
    # The images were loaded with OpenCV (BGR); reverse the channel axis so
    # matplotlib (RGB) shows true colours instead of a bluish tint.
    ax[i].imshow(X_train[i][:, :, ::-1])
    ax[i].set_title(lesion_names[np.argmax(y_train[i])])

The np.argmax function is a NumPy function that returns the index of the maximum value in an array. In the code above, np.argmax(y_train[i]) finds the position of the single 1 in the one-hot encoded label of the i-th training image. Because y_train holds the ground truth, this index is the true class label of that image, not a model prediction.

Regarding a bluish colour tone in the plots: this is not caused by the colormap. imshow applies a colormap (by default 'viridis') only to single-channel data; for 3-channel images it displays the pixel values directly and interprets them as RGB. The images here were loaded with OpenCV's imread, which stores the channels in BGR order, so red and blue are swapped whenever an image is passed to imshow unchanged. Reversing the channel axis before plotting (for example `img[:, :, ::-1]`) restores the true colours.

In [None]:
# integer class index (0-6) of every metadata row, mapped from `dx`
meta["lesion_type_short"]

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# The class labels come from the metadata table; the weights themselves are
# computed on the label array `y`.
y_id = np.array(meta["lesion_type_short"])
class_ids = np.unique(y_id)

# compute weights for the loss function, because the problem is unbalanced
balanced = compute_class_weight(class_weight='balanced', classes=class_ids, y=y)
class_weights = dict(zip(class_ids, np.around(balanced, 2)))

print('The problem is unbalanced. We need to provide class_weights ')
print(class_weights)

# The Model

# CNN in VGG style

In [1]:
import keras
from keras.utils import plot_model
from keras.layers import Conv2D, MaxPooling2D, Dense, Input, Activation, Dropout, GlobalAveragePooling2D, \
    BatchNormalization, concatenate, AveragePooling2D
from keras.optimizers import Adam
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.layers.core import Dropout, Activation
from keras.layers import Conv2D, BatchNormalization, MaxPool2D, Flatten, Dense, Input


def _conv_stage(tensor, filters, kernel_size, strides, padding, pool_strides=None):
    """Conv2D (ReLU) -> BatchNormalization -> optional 3x3 MaxPool2D."""
    tensor = Conv2D(filters=filters, kernel_size=kernel_size, strides=strides,
                    activation='relu', padding=padding)(tensor)
    tensor = BatchNormalization()(tensor)
    if pool_strides is not None:
        tensor = MaxPool2D(pool_size=(3, 3), strides=pool_strides)(tensor)
    return tensor


# One row per convolutional stage:
# (filters, kernel_size, conv strides, padding, pool strides or None for no pooling)
CONV_STAGES = [
    (64,  (7, 7), (2, 2), "valid", (2, 2)),
    (128, (3, 3), (2, 2), "same",  (1, 1)),
    (256, (3, 3), (1, 1), "same",  (1, 1)),
    (512, (3, 3), (1, 1), "same",  None),
    (512, (3, 3), (1, 1), "same",  (1, 1)),
    (512, (3, 3), (1, 1), "same",  (1, 1)),
]

input_layer = Input(shape=(100, 100, 3))

# convolutional feature extractor
deep = input_layer
for stage in CONV_STAGES:
    deep = _conv_stage(deep, *stage)

# classifier head: two 4096-unit dense layers with dropout, then 7-way softmax
deep = Flatten()(deep)
for _ in range(2):
    deep = Dense(4096, activation='relu')(deep)
    deep = Dropout(0.5)(deep)

output = Dense(7, activation='softmax')(deep)

model = Model(inputs=input_layer, outputs=output)

model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 100, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 47, 47, 64)        9472      
                                                                 
 batch_normalization (BatchN  (None, 47, 47, 64)       256       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 23, 23, 64)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 12, 12, 128)       73856     
                                                                 
 batch_normalization_1 (Batc  (None, 12, 12, 128)      512   

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# NOTE(review): patience equals the number of epochs below, so early stopping
# can never trigger; lower `patience` if early stopping is actually wanted.
early_stopping_monitor = EarlyStopping(patience=100, monitor='val_accuracy')

# Keep only the model with the best validation accuracy seen so far.
model_checkpoint_callback = ModelCheckpoint(filepath='CNN.h5',
                                            save_weights_only=False,
                                            monitor='val_accuracy',
                                            mode='auto',
                                            save_best_only=True,
                                            verbose=1)
batch_size = 32
epochs = 100
optimizer = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-3)
model.compile(optimizer = optimizer, loss = 'categorical_crossentropy', metrics=['accuracy'])

# On-the-fly augmentation of the training images.
# `datagen.fit(...)` is intentionally not called: it only computes statistics
# for featurewise centering / normalisation / ZCA whitening, none of which is
# enabled here, and it would make a float copy of the whole training set.
datagen = ImageDataGenerator(zoom_range = 0.2, horizontal_flip=True, shear_range=0.2)

# The generator produces the batches, so batch size and shuffling are set on
# `flow`; `Model.fit` must not be given `batch_size` for generator input.
train_batches = datagen.flow(X_train, y_train, batch_size=batch_size, shuffle=True)

history=model.fit(train_batches, epochs=epochs,
                  callbacks=[early_stopping_monitor,
                             model_checkpoint_callback],
                  validation_data=(X_test, y_test), class_weight=class_weights)

In [None]:

# Training and validation loss per epoch, with epochs numbered from 1.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

for curve, curve_label in ((train_loss, "Training Loss"), (val_loss, "Validation Loss")):
    epoch_numbers = range(1, len(curve) + 1)
    plt.plot(epoch_numbers, curve, label=curve_label)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
# summarize the training history: one figure for accuracy, one for loss,
# each showing the training curve and the validation ("test") curve
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# `scores` holds the loss followed by the compiled metrics (accuracy at index 1).
scores = model.evaluate(X_test, y_test, verbose=1)
accuracy_percent = scores[1] * 100
print("Accuracy: {:.2f}%".format(accuracy_percent))

In [None]:
y_pred = model.predict(X_test)

# Compare predicted and expected class per sample in one vectorised step.
predicted_class = np.argmax(y_pred, axis=1)
expected_class = np.argmax(y_test, axis=1)
is_correct = predicted_class == expected_class

# Positions of correctly / wrongly classified test samples.
accurateindex = np.flatnonzero(is_correct).tolist()
wrongindex = np.flatnonzero(~is_correct).tolist()

total = len(y_pred)
accurate = len(accurateindex)

print('Total-test-data;', total, '\taccurately-predicted-data:', accurate, '\t wrongly-predicted-data: ', total - accurate)

print('Accuracy:', round(accurate/total*100, 3), '%')

In [None]:
# class probabilities (rounded to 3 decimals) and the resulting class indices
y_pred_prob = np.around(model.predict(X_test), 3)
y_pred = y_pred_prob.argmax(axis=1)

# expected class indices recovered from the one-hot test labels
y_test2 = y_test.argmax(axis=1)

# 4x4 grid with test samples 100..115: expected vs. predicted class
plt.figure(figsize=(16, 16))
for position, index in enumerate(range(100, 116), start=1):
    plt.subplot(4, 4, position)
    plt.imshow(X_test[index, :, :, ::-1])
    label_exp = lesion_names[y_test2[index]]    # expected label
    label_pred = lesion_names[y_pred[index]]    # predicted label
    label_pred_prob = round(np.max(y_pred_prob[index]) * 100)
    plt.title('Expected:{}\n Pred.:{} ({}%)'.format(label_exp, label_pred, label_pred_prob))
plt.ylabel('')
plt.tight_layout()
plt.savefig('final_figure.png', dpi=300)
plt.show()

In [None]:
# manually saving the model as it is after the last epoch.
# A separate file name is used because 'CNN.h5' is where ModelCheckpoint
# (save_best_only=True) stores the best-validation-accuracy model; saving the
# final-epoch model under the same name would overwrite that checkpoint.
model.save('CNN_final.h5',save_format='h5')