## MELANOMA DETECTION

In [None]:
'''
IMPORTING THE BASIC LIBRARIES
NUMPY : FOR SOME CALCULATIONS
MATPLOTLIB, SEABORN, PIL : FOR SOME VISUALISATIONS
PANDAS: TO TAKE THE RAW DATA (WHICH IS IN THE .csv FORMAT)
KERAS: FOR IMAGE AUGMENTATIONS
TENSORFLOW: FOR THE MODEL MAKING AND TRAINING
SKLEARN: FOR SPLITTING THE DATASETS TO TRAIN, TEST SETS AND FOR SOME PERFORMANCE METRICS AND THE CONFUSION MATRIX

'''

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import pandas as pd
import datetime

import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import array_to_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, BatchNormalization, Dropout, Dense, MaxPool2D
from keras.callbacks import ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
'''
THERE ARE A TOTAL OF 7 DIFFERENT CLASSES PRESENT HERE WHICH ARE TO BE CLASSIFIED

0: nv - Melanocytic nevi
1: mel - Melanoma
2: bkl - Benign keratosis-like lesions
3: bcc - Basal cell carcinoma
4: akiec - Actinic keratoses and intraepithelial carcinoma / Bowen's disease
5: vasc - Vascular lesions
6: df - Dermatofibroma

VISUALISING THE DATA DISTRIBUTIONS
THE DATA IS NOT EQUALLY DISTRIBUTED ACROSS THE CLASSES,
SO ACCURACY ALONE IS NOT A SUFFICIENT METRIC. THE METRICS TO BE CONSIDERED HERE ARE AS FOLLOWS

ACCURACY
AUC
F1 SCORE

ALONG WITH THESE PERFORMANCE METRICS, THE CLASSIFICATION REPORT ALSO NEEDS TO BE EXAMINED

'''


raw_data = pd.read_csv('../input/skin-cancer-mnist-ham10000/hmnist_28_28_RGB.csv')
# Shuffle the rows; the fixed seed makes Restart & Run All reproducible.
raw_data = raw_data.sample(frac = 1, random_state = 1)
data = raw_data.iloc[:,:-1]     # every column except the last one
labels = raw_data.iloc[:,-1:]   # last column only, kept as a one-column DataFrame

# Integer label -> lesion name, same table as the plotting helpers in this notebook.
# NOTE(review): this encoding is taken from the notebook itself; confirm it
# against the dataset documentation.
class_names = {0: 'nv', 1: 'mel', 2: 'bkl', 3: 'bcc', 4: 'akiec', 5: 'vasc', 6: 'df'}

# value_counts() orders classes by frequency, so the bar names must be derived
# from its index rather than from a hand-ordered list.
class_counts = labels.iloc[:, 0].value_counts()
type_cancer = [class_names.get(int(label), str(label)) for label in class_counts.index]
counts = list(class_counts)

fig, ax = plt.subplots(figsize = (8,6))
sns.barplot(x = type_cancer, y = counts, ax = ax)
ax.set_xlabel("Dataset Lesions")
ax.set_ylabel("Number of Images");

In [None]:
'''
NOTE: 
THE DATASET IS VERY IMBALANCED: THE CLASS 'akiec' HAS A MUCH HIGHER FREQUENCY THAN THE OTHERS, WHICH BIASES THE
TRAINING PROCESS TOWARDS THAT CLASS. I HAVE TRAINED WITH THIS DATA AND SEEN THAT THE f1-score
OF THE MODEL IS HIGH FOR THE 'akiec' CLASS (LIKE 0.87) BUT MUCH LOWER FOR THE OTHER CLASSES (LIKE 0.47 OR LESS)

SOLUTION:
WE CAN DUPLICATE THE DATA OF THE OTHER CLASSES AND THEN RANDOMLY AUGMENT THOSE IMAGES
WITH THE KERAS ImageDataGenerator, SO THAT WE GET SOMEWHAT DIFFERENT IMAGES FOR THE DIFFERENT CLASSES.
THIS SOLUTION REALLY WORKED WELL FOR THE TEST DATA AS WELL.

SO THE STEPS ARE SIMPLE:
DUPLICATE THE CLASS OF THE DATA FOR SOME AMOUNT OF TIMES FOR e.g.

THE CLASS CALLED "df" WILL BE DUPLICATED 17 TIMES THE PREVIOUS
AND SIMILAR LIKE OTHER CLASSES

I HAVE TAKEN SOME OPTIMAL VALUE SUCH THAT THE DISTRIBUTION OF THE TOTAL DATA IS SOMEWHAT CENTRALIZED IN ORDER TO GET THE WORK DONE
COOL THEN LETS START TO WORK

'''

raw_data = pd.read_csv('../input/skin-cancer-mnist-ham10000/hmnist_28_28_RGB.csv')
# drop=True avoids creating a helper 'index' column that would have to be removed later.
raw_data = raw_data.sort_values('label').reset_index(drop = True)

# Number of EXTRA copies of each class to stack on top of the one copy already in raw_data.
# Each value is the original duplication factor + 1, because the earlier
# `frame.append([frame] * k)` idiom produced k + 1 copies of the class frame.
# Label 4 is intentionally absent: it is left at its original frequency.
extra_copies = {0: 18, 1: 16, 2: 6, 3: 53, 5: 46, 6: 6}

# DataFrame.append was removed in pandas 2.0; collect the pieces and concatenate once.
frames = [raw_data]
for class_label, n_copies in extra_copies.items():
    class_rows = raw_data[raw_data['label'] == class_label]
    frames.extend([class_rows] * n_copies)

final_data = pd.concat(frames)
# Shuffle so the duplicated rows are not grouped by class; seeded for reproducibility.
final_data = final_data.sample(frac = 1, random_state = 1)
data = final_data.iloc[:,:-1]     # every column except the last one
labels = final_data.iloc[:,-1:]   # last column only, kept as a one-column DataFrame

# NOTE(review): rows are duplicated before any train/test split, so identical
# copies of an image can land on both sides of a later split.

In [None]:
'''
NOW WE CAN SEE THAT THE DISTRIBUTION OF THE CLASSES IS MORE BALANCED THAN BEFORE
AND NOT BIASED TOWARDS ONE SPECIFIC CLASS
'''


# Integer label -> lesion name, same table as the plotting helpers in this notebook.
# NOTE(review): this encoding is taken from the notebook itself; confirm it
# against the dataset documentation.
class_names = {0: 'nv', 1: 'mel', 2: 'bkl', 3: 'bcc', 4: 'akiec', 5: 'vasc', 6: 'df'}

# value_counts() orders classes by frequency, and that order changes after
# oversampling, so the bar names are read from the index instead of a fixed list.
class_counts = labels.iloc[:, 0].value_counts()
type_cancer = [class_names.get(int(label), str(label)) for label in class_counts.index]
counts = list(class_counts)

fig, ax = plt.subplots(figsize = (8,6))
sns.barplot(x = type_cancer, y = counts, ax = ax)
ax.set_xlabel("Dataset Lesions")
ax.set_ylabel("Number of Images");

In [None]:
'''
MAKING THE FEATURES AND THE LABELS OUT OF THE RAW DATA
WHERE:
X : FEATURE
Y : LABELS

NOW RESHAPING THE DATA ON THE BASIS OF THE THREE COLOR CHANNELS
AS THE IMAGES ARE IN THE FORM OF THE RGB FORMAT

SO THERE ARE:
10015 SAMPLE PICTURES IN THE ORIGINAL CSV (THE OVERSAMPLED DATA BUILT ABOVE HAS MORE ROWS; SEE THE PRINTED SHAPE)
WHERE EACH PICTURE HAS A HEIGHT AND WIDTH OF 28 WITH THREE CHANNELS

'''

# Labels stay as an (n, 1) column; the plotting helpers index them as Y[i][0].
Y = np.array(labels)

# Each row of `data` is unflattened into one 28 x 28 image with 3 channels.
X = np.array(data).reshape(-1, 28, 28, 3)

for array_name, array in (("X", X), ("Y", Y)):
    print("SHAPE OF {} IS: ".format(array_name), array.shape)

In [None]:
'''
VISUALISING THE PLOTS OF THE IMAGES WHERE EACH OF THE IMAGES ARE LABELED WITH 
THE CORRESPONDING LABELS OF THE IMAGES i.e. WHICH TYPE OF CANCER IT IS

WHERE:

nv - Melanocytic nevi
mel - Melanoma
bkl - Benign keratosis-like lesions
bcc - Basal cell carcinoma
akiec - Actinic keratoses and intraepithelial carcinoma / Bowen's disease
vasc - Vascular lesions
df - Dermatofibroma

'''

def visualisePlots(X,Y, rows, columns):
    """Show the first ``rows * columns`` images of ``X`` in a grid, each titled with its class name.

    Parameters
    ----------
    X : sequence of image arrays
        Each ``X[i]`` is handed to ``array_to_img`` for display.
    Y : sequence of label rows
        ``Y[i][0]`` is looked up in the class-name table below.
    rows, columns : int
        Grid dimensions.

    When ``X`` holds fewer than ``rows * columns`` images only the available
    ones are drawn (previously this raised ``IndexError``).
    """
    # NOTE(review): label encoding as used throughout this notebook; confirm
    # against the dataset documentation.
    class_dicts = {
        0: 'nv',
        1: 'mel',
        2: 'bkl',
        3: 'bcc',
        4: 'akiec',
        5: 'vasc',
        6: 'df',
    }

    n_images = min(rows * columns, len(X))

    fig = plt.figure(figsize=(11,11))
    for i in range(n_images):
        fig.add_subplot(rows, columns, i+1)
        plt.imshow(array_to_img(X[i]))
        # Tick marks carry no information for image thumbnails.
        plt.xticks([])
        plt.yticks([])
        plt.title(str(class_dicts[Y[i][0]]))
    plt.show()

# using the above function

visualisePlots(X,Y, 7,7)

### GETTING THE FEATURES FROM DATA, MAKING THE MODEL, TRAINING THE MODEL

In [None]:
'''
SPLITTING THE DATA INTO TRAIN AND TESTING DATA
WHERE WE SPLIT THE DATA INTO 20% TESTING DATA AND 80% TRAINING DATA
THE DATA IS STANDARDISED AFTER THE SPLIT, USING THE MEAN AND STANDARD DEVIATION OF THE TRAINING DATA ONLY,
SO THAT NO STATISTICS OF THE TESTING DATA LEAK INTO THE PREPROCESSING

AFTER THE SPLIT, KERAS GENERATORS ARE BUILT THAT AUGMENT THE TRAINING DATA
SO THAT THE DUPLICATED IMAGES ARE NOT 100% SIMILAR TO THE ORIGINAL DATA
AND CAN BE TREATED AS OUR TRAINABLE DATA
SO IT'S TIME TO MAKE THE MODEL AND TRAIN IT

'''

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Global (all pixels, all channels) statistics of the training split, applied to both splits.
train_mean = np.mean(X_train)
train_std = np.std(X_train)
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

# NOTE(review): X was oversampled before this split, so exact duplicates of a
# training image can also appear in X_test; test scores may be optimistic.

# No `rescale` here: the arrays are already standardised, and dividing by 255
# again would put generator batches on a different scale from the plain
# X_train / X_test arrays.
train_datagen = ImageDataGenerator(rotation_range = 10,
                                  width_shift_range = 0.2,
                                  height_shift_range = 0.2,
                                  shear_range = 0.2,
                                  horizontal_flip = True,
                                  vertical_flip = True,
                                  fill_mode = 'nearest')

# The test generator applies no augmentation at all.
test_datagen = ImageDataGenerator()

# ImageDataGenerator.fit() is only required for featurewise centering/normalisation
# or ZCA whitening, none of which are enabled, so it is not called.

train_data = train_datagen.flow(X_train, Y_train, batch_size = 64)
# shuffle=False keeps test batches aligned with the order of Y_test.
test_data = test_datagen.flow(X_test, Y_test, batch_size = 64, shuffle = False)

In [None]:
'''
MODEL MAKING
CONV2D (16 FILTERS, SAME PADDING) -> CONV2D (32 FILTERS) -> MAXPOOL
CONV2D (32 FILTERS, SAME PADDING) -> CONV2D (64 FILTERS) -> MAXPOOL
CONV2D (64 FILTERS) -> CONV2D (64 FILTERS, SAME PADDING) -> MAXPOOL
(PADDING IS USED BECAUSE THE IMAGES ARE VERY SMALL, SO THERE IS A CHANCE OF GETTING LOWER QUALITY FEATURES WITHOUT IT)
FLATTEN
DENSE (64)
DENSE (64)
DENSE (32)
DENSE (32)
DENSE (7 ) FINAL SOFTMAX LAYER

OPTIMIZER = ADAM
LOSS = SPARSE CATEGORICAL CROSS ENTROPY
CALLBACKS: ReduceLROnPlateau (HALVES THE LEARNING RATE WHEN THE VALIDATION ACCURACY STOPS IMPROVING)

'''

model = Sequential([
    # Block 1
    Conv2D(16, kernel_size = (3,3), input_shape = (28, 28, 3), activation = 'relu', padding = 'same'),
    Conv2D(32, kernel_size = (3,3), activation = 'relu'),
    MaxPool2D(pool_size = (2,2)),

    # Block 2
    Conv2D(32, kernel_size = (3,3), activation = 'relu', padding = 'same'),
    Conv2D(64, kernel_size = (3,3), activation = 'relu'),
    MaxPool2D(pool_size = (2,2), padding = 'same'),

    # Block 3
    Conv2D(64, kernel_size = (3,3), activation = 'relu'),
    Conv2D(64, kernel_size = (3,3), activation = 'relu', padding = 'same'),
    MaxPool2D(pool_size = (2,2), padding = 'same'),

    Flatten(),

    # Classifier head; the last layer emits one probability per class.
    Dense(64, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(7, activation = 'softmax'),
])

# The model is compiled with metrics=['accuracy'], and the training history is
# read back under the key 'val_accuracy'. Monitoring 'val_acc' never matched a
# logged metric, so the learning rate was never reduced.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.00075,
                                    beta_1 = 0.9,
                                    beta_2 = 0.999,
                                    epsilon = 1e-8)

# Sparse loss: the labels are integer class ids, not one-hot vectors.
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = optimizer,
              metrics = ['accuracy'])

# summary() prints by itself and returns None; wrapping it in print() emitted a stray "None".
model.summary()

In [None]:
### TRAINING THE MODEL

# Wall-clock timestamps taken around the fit call; the elapsed time is reported in a later cell.
start_model = datetime.datetime.now()

# NOTE(review): training uses the plain X_train / Y_train arrays. The augmenting
# `train_data` generator built earlier is not used in the cells visible here.
fit_options = dict(
    validation_split = 0.2,
    batch_size = 64,
    epochs = 20,
    callbacks = [learning_rate_reduction],
)
history = model.fit(X_train, Y_train, **fit_options)

end_model = datetime.datetime.now()

### VISUALIZING THE DIFFERENT PERFORMANCE MEASURE OF THE MODEL (ACC, LOSS CURVES | CONFUSION MATRIX | CLASSIFICATION REPORT)

In [None]:
# Per-epoch metrics recorded by model.fit().
ACC = history.history['accuracy']
VAL_ACC = history.history['val_accuracy']
LOSS = history.history['loss']
VAL_LOSS = history.history['val_loss']

# One figure per metric: (title, y-axis label, [(values, legend label), ...]).
curve_specs = [
    ("THE ACCURACY OF THE TRAINING AND VALIDATION PHASE OF THE MODEL",
     "Accuracy",
     [(ACC, 'train_acc'), (VAL_ACC, 'val_acc')]),
    ("THE LOSS OF THE TRAINING AND VALIDATION PHASE OF THE MODEL",
     "Loss",
     [(LOSS, 'train_loss'), (VAL_LOSS, 'val_loss')]),
]

for curve_title, y_label, curves in curve_specs:
    fig, ax = plt.subplots(figsize=(8,6))
    for values, legend_label in curves:
        ax.plot(values, label = legend_label)
    ax.set_title(curve_title)
    # Axis labels so each figure can be read on its own.
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)
    ax.legend()

In [None]:
# Flatten the label array to 1-D so it has the same shape as the predicted ids
# (a no-op if Y_test is already 1-D).
Y_true = np.array(Y_test).ravel()

# model.predict returns one score vector per sample; argmax over axis 1 picks the predicted class id.
Y_pred = np.argmax(model.predict(X_test), axis=1)

cm1 = confusion_matrix(Y_true, Y_pred)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm1, annot = True, fmt = 'g' ,vmin = 0, cmap = 'Blues', ax = ax)
ax.set_title('####  THE CONFUSION MATRIX OF THE MODEL WITH TESTING DATA ####')
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');

In [None]:
def visualisePlots_test(X,Y, model, rows, columns):
    """Show the first ``rows * columns`` images of ``X`` titled ``"<true> || <predicted>"``.

    Parameters
    ----------
    X : array of images
        Sliced with ``X[:n]``; the slice is passed to ``model.predict`` and
        each image to ``array_to_img``.
    Y : sequence of label rows
        ``Y[i][0]`` is the true class id of ``X[i]``.
    model : object with a ``predict`` method returning one score vector per image.
    rows, columns : int
        Grid dimensions.

    Only the displayed images are predicted (previously the whole of ``X`` was),
    and when ``X`` holds fewer than ``rows * columns`` images only the available
    ones are drawn (previously this raised ``IndexError``).
    """
    # NOTE(review): label encoding as used throughout this notebook; confirm
    # against the dataset documentation.
    class_dicts = {
        0: 'nv',
        1: 'mel',
        2: 'bkl',
        3: 'bcc',
        4: 'akiec',
        5: 'vasc',
        6: 'df',
    }

    n_images = min(rows * columns, len(X))
    shown_images = X[:n_images]

    # Predicted class id = position of the highest score in each output vector.
    Y_pred = np.argmax(model.predict(shown_images), axis=1) if n_images else []

    fig = plt.figure(figsize=(12,12))
    for i in range(n_images):
        fig.add_subplot(rows, columns, i+1)
        plt.imshow(array_to_img(shown_images[i]))
        plt.xticks([])
        plt.yticks([])
        plt.title(str(class_dicts[Y[i][0]]) + " || " + str(class_dicts[Y_pred[i]]))
    plt.show()

# USING THE MODELS AND VISUALISING THEM

print('THE PLOTS TESTING WITH THE MODEL')
visualisePlots_test(X_test,Y_test, model, 7, 7)

In [None]:
# Integer class id -> lesion name, in id order.
# NOTE(review): encoding as used throughout this notebook; confirm against the
# dataset documentation.
label_mapping = {
    0: 'nv',
    1: 'mel',
    2: 'bkl',
    3: 'bcc',
    4: 'akiec',
    5: 'vasc',
    6: 'df'
}

# Passing `labels` explicitly ties every name to its class id and keeps the
# report working even if some class is absent from this test split.
classification_report_model = classification_report(Y_true,
                                                    Y_pred,
                                                    labels=list(label_mapping.keys()),
                                                    target_names=list(label_mapping.values()))
print(classification_report_model)

In [None]:
# Elapsed wall-clock time between the two timestamps taken around model.fit().
time_model = end_model - start_model
print("TIME TAKEN BY MODEL : ", time_model)

# Entry 1 of the evaluate() result is read as the accuracy; the visible compile
# call lists 'accuracy' as its only metric.
test_metrics = model.evaluate(X_test, Y_test, verbose=0)
model_acc_test = test_metrics[1]
print("TEST ACCURACY OF MODEL: {:.3f}%".format(100 * model_acc_test))