In [10]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from glob import glob
from PIL import Image
from tqdm import tqdm
from keras.preprocessing import image
%matplotlib inline
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Flatten
import tensorflow as tf
from tensorflow import keras

In [None]:
#create function to add image path location in metadata
def end_to_end_data_prep(base='/cxldata/skin_disease_1/'):
    """Load the HAM10000 metadata and attach each image's file path.

    Parameters
    ----------
    base : str, optional
        Dataset root directory. It must contain ``HAM10000_metadata_orig.csv``
        and the images under ``HAM10000_images_draftv1/orig/``. Defaults to the
        location used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The metadata with an added ``path`` column. Rows whose ``image_id`` has
        no matching ``.jpg`` file get a missing value in ``path``.
    """
    metadata = pd.read_csv(os.path.join(base, 'HAM10000_metadata_orig.csv'))
    # Index the image files by file name without extension, which is the
    # value stored in the metadata's image_id column.
    pattern = os.path.join(base, 'HAM10000_images_draftv1', 'orig', '*.jpg')
    image_path = {os.path.splitext(os.path.basename(x))[0]: x
                  for x in glob(pattern)}
    metadata['path'] = metadata['image_id'].map(image_path.get)

    return metadata

In [None]:
# plot sample images for each diagnosis category
def end_to_end_data_prep_plot(metadata, n_samples=5):
    """Plot a grid of sample images, one row per diagnosis class (``dx``).

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must have a ``dx`` column and either an ``image`` column holding pixel
        arrays or a ``path`` column holding image file paths.
    n_samples : int, optional
        Number of images (grid columns) drawn per class. Classes with fewer
        rows show as many images as they have.

    Returns
    -------
    None
        The figure is drawn with matplotlib as a side effect.
    """
    groups = list(metadata.sort_values(['dx']).groupby('dx'))
    if not groups:
        # Nothing to draw; a zero-row subplot grid would raise.
        return
    n_rows = len(groups)
    # squeeze=False keeps m_axs two-dimensional even for a single class/sample.
    _, m_axs = plt.subplots(n_rows, n_samples,
                            figsize=(4 * n_samples, 3 * n_rows), squeeze=False)
    for n_axs, (type_name, type_rows) in zip(m_axs, groups):
        n_axs[0].set_title(type_name)
        for c_ax in n_axs:
            c_ax.axis('off')
        picked = type_rows.sample(min(n_samples, len(type_rows)), random_state=1234)
        for c_ax, (_, c_row) in zip(n_axs, picked.iterrows()):
            if 'image' in c_row.index:
                pixels = c_row['image']
            else:
                # No preloaded pixel data: read the image from its path.
                with Image.open(c_row['path']) as img_file:
                    pixels = np.asarray(img_file)
            c_ax.imshow(pixels)

In [None]:
#preparation for the augmentation
def data_prep_augmentation(path):
    """Load one image as a single-item batch for the augmentation generator.

    The file at ``path`` is read with ``load_img`` at a target size of
    (224, 224), converted with ``img_to_array``, and returned with a leading
    axis of length 1 prepended to its shape.
    """
    resized = load_img(path, target_size=(224, 224))
    pixels = img_to_array(resized)
    return pixels.reshape((1, *pixels.shape))


In [None]:
#augmentation image generator
def data_prep_augmentation_generator(input_image_array,output_dir,output_prefix):
    """Draw augmented variants of ``input_image_array`` and save them as JPEGs.

    An ``ImageDataGenerator`` with random rotation, shifts, shear, zoom and
    horizontal flip is run over the input with ``batch_size=1``. The flow is
    given ``output_dir`` as ``save_to_dir`` and ``output_prefix`` as
    ``save_prefix``. Iteration stops once the draw counter exceeds 10, i.e.
    after 11 batches have been drawn. Nothing is returned.
    """
    augmenter = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )
    stream = augmenter.flow(
        input_image_array,
        batch_size=1,
        save_to_dir=output_dir,
        save_prefix=output_prefix,
        save_format='jpg',
    )
    # The flow never ends on its own, so stop explicitly.
    for drawn, _ in enumerate(stream, start=1):
        if drawn > 10:
            break

In [None]:
# Build the metadata table (with the image `path` column), then plot sample
# images for each diagnosis class.
metadata = end_to_end_data_prep()
end_to_end_data_prep_plot(metadata)

In [None]:
# Stratified 80/20 train/test split on the diagnosis label `dx`.
# random_state is fixed so the split, and the CSV files written from it,
# come out the same after a kernel restart.
train, test = train_test_split(metadata, stratify=metadata['dx'], test_size=0.2, random_state=42)

In [None]:
# first import all data into dataframe then run train_test_split function to split data

# total samples are 42650
# split with train_test_split (stratify = y,test 0.2 size)
# train  - 34120
#test - 8530

In [None]:
# Save the training split to CSV so it does not have to be rebuilt every
# session (34120 records).
# to_csv writes the DataFrame index as an extra unnamed first column by default.
train.to_csv('/cxldata/skin_disease_1/sudeep/sudeep_train.csv')

In [None]:
# Save the test split to CSV so it does not have to be rebuilt every
# session (8530 records).
# to_csv writes the DataFrame index as an extra unnamed first column by default.

test.to_csv('/cxldata/skin_disease_1/sudeep/sudeep_test.csv')

In [2]:
# Reload the saved training split (34120 records); read_csv assigns a fresh
# 0..n-1 row index.
import pandas as pd
train_df = pd.read_csv('/cxldata/skin_disease_1/sudeep/sudeep_train.csv')

In [4]:
# Sanity check: (rows, columns) of the reloaded training split.
train_df.shape

(34120, 11)

In [3]:
# Reload the saved test split (8530 records); read_csv assigns a fresh
# 0..n-1 row index.
import pandas as pd 
test_df = pd.read_csv('/cxldata/skin_disease_1/sudeep/sudeep_test.csv')

In [5]:
# Sanity check: (rows, columns) of the reloaded test split.
test_df.shape

(8530, 11)

In [6]:
# Fitting the model on all 34120 records was not feasible, so training uses
# only the first 12000 rows of the training split.
train_df1 = train_df.iloc[:12000]

In [7]:
# Sanity check: the training subset should have 12000 rows.
train_df1.shape

(12000, 11)

In [11]:
# Load every training image, resize it to 64x64 and scale pixels to [0, 1].
# Stacking the per-image arrays gives X its leading sample axis.
train_image = []
# Iterate over the path values directly rather than looking rows up by the
# label i, so the loop does not depend on the frame having a 0..n-1 index.
for img_path in tqdm(train_df1['path']):
    # target_size is (height, width); the channel count is not part of it.
    img = image.load_img(img_path, target_size=(64, 64))
    img = image.img_to_array(img)
    train_image.append(img / 255)

X = np.array(train_image)

100%|██████████| 12000/12000 [03:45<00:00, 53.16it/s]


In [12]:
# One-hot encode the diagnosis labels (`dx`) of the training subset.
# The labels are reshaped to a single column because the encoder expects 2-D input.
# NOTE(review): the `sparse` argument was renamed `sparse_output` in newer
# scikit-learn releases — confirm against the installed version.
from sklearn.preprocessing import OneHotEncoder
y = train_df1['dx'].values
onehotencoder = OneHotEncoder(categories='auto',sparse=False)
y = onehotencoder.fit_transform(y.reshape(-1,1))

In [13]:
# Sanity check: (samples, height, width, channels) of the image array.
X.shape

(12000, 64, 64, 3)

In [14]:
# Sanity check: (samples, one-hot columns) of the encoded labels.
y.shape

(12000, 7)

In [15]:
# Stratified 80/20 train/validation split of the loaded images and labels.
# random_state is fixed so the same rows land in each part on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [16]:
# Sanity check: sizes of the training and validation image arrays.
print(X_train.shape)
print(X_valid.shape)

(9600, 64, 64, 3)
(2400, 64, 64, 3)


In [17]:
# Sanity check: sizes of the training and validation label arrays.
print(y_train.shape)
print(y_valid.shape)

(9600, 7)
(2400, 7)


In [None]:
# Transfer-learning model: a VGG16 convolutional base pretrained on ImageNet
# (without its classifier top) followed by a small dense head for the 7 classes.
import tensorflow as tf
from tensorflow import keras
base_model = keras.applications.vgg16.VGG16(weights="imagenet",
                                            include_top=False,
                                            input_shape=(64, 64, 3))
# The head is built with tf.keras as well: the VGG16 base comes from
# tensorflow.keras, and combining it with layers from the standalone `keras`
# package mixes two separate implementations.
model_t1 = keras.models.Sequential([
    base_model,
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(7, activation="softmax"),
])
# Alternative head that was considered: GlobalAveragePooling2D on
# base_model.output followed by Dense(7, activation="softmax").

In [None]:
# Mark every layer of the pretrained VGG16 base as non-trainable. This runs
# before the compile() call in the next cell.
for layer in base_model.layers:
    layer.trainable = False

In [None]:
# Compile with categorical cross-entropy (the labels are one-hot encoded) and
# the Nadam optimizer, then train for 3 epochs with the validation split.
# An SGD optimizer was tried earlier and is kept here for reference:
#optimizer = keras.optimizers.SGD(lr=0.2)
model_t1.compile(loss="categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model_t1.fit(X_train,y_train,epochs=3,validation_data=(X_valid,y_valid)) #changed from 5

In [None]:
# Save the trained model as an HDF5 file; a later cell reloads it for evaluation.
model_t1.save('/cxldata/skin_disease_1/sudeep/1/4_nadam.h5')

In [20]:
# Load every test image with the same preprocessing as the training images:
# resize to 64x64 and scale pixels to [0, 1].
test_image = []
# Iterate over the path values directly rather than looking rows up by the
# label i, so the loop does not depend on the frame having a 0..n-1 index.
for img_path in tqdm(test_df['path']):
    # target_size is (height, width); the channel count is not part of it.
    img = image.load_img(img_path, target_size=(64, 64))
    img = image.img_to_array(img)
    test_image.append(img / 255)

X_test = np.array(test_image)

100%|██████████| 8530/8530 [02:37<00:00, 54.23it/s]


In [48]:
# Encode the test labels with the encoder that was fitted on the training
# labels. Refitting on the test labels would let the column order and count
# depend on which classes happen to occur in the test set.
y_test = onehotencoder.transform(test_df['dx'].values.reshape(-1, 1))

In [18]:
# Reload the saved model from disk so evaluation can run without retraining.
model = keras.models.load_model('/cxldata/skin_disease_1/sudeep/1/4_nadam.h5')

In [22]:
# Evaluate the reloaded model on the test set. `results` is printed as
# [test loss, test accuracy].
print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=128)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [0.3455950915813446, 0.9089097380638123]


In [23]:
# Run the model over the whole test set.
# NOTE(review): with the 7-unit softmax head, predict() presumably returns one
# row of 7 class scores per image rather than class labels — take
# np.argmax(..., axis=1) before comparing against integer labels.
y_pred=model.predict(X_test, batch_size=128)

In [None]:
# prediction for single image
# Class names in alphabetical order.
# NOTE(review): assumed to match the column order of the one-hot encoded `dx`
# labels used for training — confirm against the encoder's categories.
classes = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
img_path1 = '/cxldata/skin_disease_1/HAM10000_images_draftv1/akiec2/akiec_0_9864.jpg'
img1 = image.load_img(img_path1, color_mode='rgb', target_size=(64, 64))
display(img1)
# Apply the same preprocessing as the training and test loaders (pixels scaled
# to [0, 1]). The previous call to `preprocess_input` was never imported and
# would not have matched what the model was trained on.
x1 = image.img_to_array(img1) / 255
# Adding the fourth dimension, for number of images
x1 = np.expand_dims(x1, axis=0)

# Use the model reloaded from disk, as the evaluation cells do.
features1 = model.predict(x1)
MaxPosition = np.argmax(features1)
prediction_label = classes[MaxPosition]
print(prediction_label)

In [49]:
# Convert the one-hot test labels back to integer class indices.
import numpy as np
rounded_labels=np.argmax(y_test, axis=1)


In [40]:
# Spot check: integer class index of one test sample.
rounded_labels[1000]

5

In [50]:
from sklearn.metrics import confusion_matrix
# confusion_matrix needs class indices on both sides. y_pred holds the raw
# predict() output, so reduce each row of class scores to the index of its
# largest score first (left unchanged if it already holds indices).
y_pred_labels = np.argmax(y_pred, axis=1) if np.ndim(y_pred) > 1 else y_pred
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(rounded_labels, y_pred_labels)
cm

array([[1038,   10,   28,   10,    8,   17,   10],
       [  13,  924,   16,   19,    3,   45,   11],
       [  14,   12, 1093,   15,   11,   96,   12],
       [   8,   14,   15, 1194,    4,   14,    7],
       [   8,    9,   41,    6, 1112,  109,   17],
       [   6,   13,   45,   19,   46, 1182,   30],
       [   2,    2,    2,    3,    0,    7, 1210]])

In [52]:
from sklearn.metrics import classification_report
# classification_report needs class indices on both sides. y_pred holds the raw
# predict() output, so reduce each row of class scores to the index of its
# largest score first (left unchanged if it already holds indices).
y_pred_labels = np.argmax(y_pred, axis=1) if np.ndim(y_pred) > 1 else y_pred
print(classification_report(rounded_labels, y_pred_labels))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1121
           1       0.94      0.90      0.92      1031
           2       0.88      0.87      0.88      1253
           3       0.94      0.95      0.95      1256
           4       0.94      0.85      0.89      1302
           5       0.80      0.88      0.84      1341
           6       0.93      0.99      0.96      1226

    accuracy                           0.91      8530
   macro avg       0.91      0.91      0.91      8530
weighted avg       0.91      0.91      0.91      8530

