In [1]:
import zipfile
import os

# Locations of the two HAM10000 image archives under /content/drive/MyDrive
zip_path_1 = '/content/drive/MyDrive/HAM10000 dir/HAM10000_images_part_1.zip'
zip_path_2 = '/content/drive/MyDrive/HAM10000 dir/HAM10000_images_part_2.zip'

# Each archive is unpacked into its own folder beneath extract_path
extract_path = '/content/'
output_folder_1, output_folder_2 = (
    os.path.join(extract_path, folder_name)
    for folder_name in ('HAM10000 PT1', 'HAM10000 PT2')
)

# Make sure both destination folders exist (a no-op when already present)
for output_folder in (output_folder_1, output_folder_2):
    os.makedirs(output_folder, exist_ok=True)

# Helper that unpacks one zip archive into a chosen directory
def unzip_file(zip_path, extract_to):
    """Extract every member of the zip archive at ``zip_path`` into ``extract_to``.

    Args:
        zip_path: Path of the ``.zip`` file to read.
        extract_to: Directory that receives the extracted members.

    Returns:
        None.
    """
    archive = zipfile.ZipFile(zip_path, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        # Always release the archive handle, even if extraction fails
        archive.close()

# Pair every archive with the folder it should be unpacked into
extraction_jobs = (
    (zip_path_1, output_folder_1),
    (zip_path_2, output_folder_2),
)

# Unpack both archives first ...
for archive_path, target_folder in extraction_jobs:
    unzip_file(archive_path, target_folder)

# ... then report where each one ended up
for archive_path, target_folder in extraction_jobs:
    print(f"Files from {archive_path} have been extracted to {target_folder}")



Files from /content/drive/MyDrive/HAM10000 dir/HAM10000_images_part_1.zip have been extracted to /content/HAM10000 PT1
Files from /content/drive/MyDrive/HAM10000 dir/HAM10000_images_part_2.zip have been extracted to /content/HAM10000 PT2


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# List the dataset folder on Drive to confirm the image archives and metadata
# files are present. The path is absolute (like the zip paths used for
# extraction) so the listing does not depend on the current working directory.

import os
print(os.listdir("/content/drive/MyDrive/HAM10000 dir"))


['HAM10000_images_part_1.zip', 'HAM10000_images_part_2.zip', 'Ham10000 metadata csv.csv', 'Ham10000 metadata.gsheet', 'HAM 10000 metadata.csv']


In [7]:
# Location of the HAM10000 metadata table
# NOTE(review): no earlier cell creates this file and the name has no .csv
# extension — confirm it is present under /content before running.
csv_file_path = '/content/HAM10000_metadata'

# Parse the comma-separated metadata into a DataFrame
df = pd.read_csv(csv_file_path, sep=',')

# Preview the first rows
df.head()


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [8]:
from os.path import isfile
from PIL import Image as pil_image
# Number of image rows sharing each lesion_id, broadcast back onto every row
df['num_images'] = df.groupby('lesion_id')['image_id'].transform('count')

# Give every diagnosis code an integer id, numbered in order of first appearance
classes = df['dx'].unique()
labeldict = {name: num for num, name in enumerate(classes)}
df['dx_id'] = df['dx'].map(lambda code: labeldict[code])


def expand_path(p):
    """Resolve an image id to the path of its extracted ``.jpg`` file.

    The folder '../content/HAM10000 PT1/' is checked first, then
    '../content/HAM10000 PT2/'. When neither contains ``<p>.jpg`` the id is
    returned unchanged.

    Args:
        p: Image id (file name without extension).

    Returns:
        The relative path of the first matching file, or ``p`` itself.
    """
    for folder in ('../content/HAM10000 PT1/', '../content/HAM10000 PT2/'):
        candidate = folder + p + '.jpg'
        if isfile(candidate):
            return candidate
    return p
# Resolve each image id to the file extracted from one of the two archives
df['image_path'] = df['image_id'].apply(expand_path)

# Load every image resized to 150x112 (PIL takes (width, height)) as a NumPy array
df['images'] = df['image_path'].map(
    lambda path: np.asarray(pil_image.open(path).resize((150, 112)))
)
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,num_images,dx_id,image_path,images
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,2,0,../content/HAM10000 PT1/ISIC_0027419.jpg,"[[[188, 151, 193], [193, 156, 198], [192, 155,..."
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,2,0,../content/HAM10000 PT1/ISIC_0025030.jpg,"[[[24, 13, 22], [24, 13, 22], [24, 14, 25], [2..."
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,2,0,../content/HAM10000 PT1/ISIC_0026769.jpg,"[[[186, 126, 135], [189, 131, 142], [192, 136,..."
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,2,0,../content/HAM10000 PT1/ISIC_0025661.jpg,"[[[23, 11, 16], [24, 11, 19], [26, 13, 22], [3..."
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,2,0,../content/HAM10000 PT2/ISIC_0031633.jpg,"[[[129, 87, 109], [139, 94, 117], [148, 102, 1..."


In [11]:
# Single table of (short diagnosis code, display name); row position is the lesion id
_LESION_TABLE = (
    ('nv', 'Melanocytic nevi'),
    ('mel', 'Melanoma'),
    ('bkl', 'Benign keratosis-like lesions '),
    ('bcc', 'Basal cell carcinoma'),
    ('akiec', 'Actinic keratoses'),
    ('vasc', 'Vascular lesions'),
    ('df', 'Dermatofibroma'),
)

# Short code -> human-readable lesion name
lesion_type_dict = dict(_LESION_TABLE)

# Short codes and display names as parallel lists, in table order
lesion_names_short = [code for code, _ in _LESION_TABLE]
lesion_names = [label for _, label in _LESION_TABLE]

# Short code -> integer id (0..6, following table order)
lesion_ID_dict = {code: idx for idx, code in enumerate(lesion_names_short)}

# Translate each diagnosis code into its readable name and its integer id
dx_codes = df['dx']
df['lesion_type'] = dx_codes.map(lesion_type_dict)
df['lesion_ID'] = dx_codes.map(lesion_ID_dict)

total_images = len(df)
print('Total number of images', total_images)
print('The problem is unbalanced, since Melanocytic nevi is much more frequent that other labels')

# Per-class image counts, most frequent first
df['lesion_type'].value_counts()

Total number of images 10015
The problem is unbalanced, since Melanocytic nevi is much more frequent that other labels


lesion_type
Melanocytic nevi                  6705
Melanoma                          1113
Benign keratosis-like lesions     1099
Basal cell carcinoma               514
Actinic keratoses                  327
Vascular lesions                   142
Dermatofibroma                     115
Name: count, dtype: int64

In [19]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical

In [20]:
# Build the image tensor and one-hot targets, then split 70/15/15 into
# train / validation / test, stratified on the labels.
# NOTE(review): the split is per image row, so rows that share a lesion_id can
# land in different splits — confirm this is acceptable for the evaluation.
X = np.stack(df['images'].to_numpy())
y = to_categorical(df['dx_id'], num_classes=len(df['dx'].unique()))

# Carve off 30% as a holdout pool first, then halve it into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print('Train dataset shape', X_train.shape)
print('Test dataset shape', X_test.shape)

Train dataset shape (7010, 112, 150, 3)
Test dataset shape (1503, 112, 150, 3)


In [22]:
# Balanced per-class weights, computed over the dx_id labels of the whole
# dataframe, to counter the class imbalance during training
dx_labels = df['dx_id']
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(dx_labels),
    y=dx_labels,
)

# Re-key the weight array as {class index: weight}
class_weights = dict(enumerate(balanced_weights))

print('The problem is unbalanced. We need to provide class_weights ')
print(class_weights)

The problem is unbalanced. We need to provide class_weights 
{0: 1.301832835044846, 1: 0.21338020666879728, 2: 12.440993788819876, 3: 1.2854575792581184, 4: 10.075452716297788, 5: 2.78349082823791, 6: 4.375273044997815}


In [23]:
# Data augmentation for training.
# Both generators run ResNet50's preprocess_input on every image (after any
# augmentation) so the raw resized RGB pixels in X_* are converted to the input
# format the ImageNet-pretrained backbone was trained with.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    zoom_range=0.2,
    horizontal_flip=True,
    shear_range=0.2,
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2
)

# No augmentation for held-out data, only the same preprocessing as training
validation_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Training batches are reshuffled; validation batches keep a fixed order
train_gen = train_datagen.flow(X_train, y_train, batch_size=32, shuffle=True)
validation_gen = validation_datagen.flow(X_val, y_val, batch_size=32, shuffle=False)

In [28]:
# ImageNet-pretrained ResNet50 backbone without its classification head
base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(112, 150, 3))

# Classification head: global average pooling -> dropout -> 1024-unit ReLU -> softmax
n_classes = len(np.unique(df['dx']))
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.5)(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(n_classes, activation='softmax')(x)

# Join backbone and head into a single model
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze every backbone layer except the final four
frozen_layers = base_model.layers[:-4]
for backbone_layer in frozen_layers:
    backbone_layer.trainable = False

# Print the layer-by-layer summary
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 112, 150, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 118, 156, 3)          0         ['input_4[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 56, 75, 64)           9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 56, 75, 64)           256       ['conv1_conv[0][0]']          
 on)                                                                                        

In [None]:
# Optimizer
optimizer = Adam(learning_rate=0.001)

# Compile the model (one-hot targets -> categorical cross-entropy)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks, all driven by validation accuracy:
#  - stop after 10 epochs without improvement and restore the best weights
#  - save the best full model to best_model.h5
#  - halve the learning rate after 5 stagnant epochs, down to 1e-6
early_stopping_monitor = EarlyStopping(patience=10, monitor='val_accuracy', restore_best_weights=True)
model_checkpoint_callback = ModelCheckpoint(filepath='best_model.h5', save_weights_only=False, monitor='val_accuracy', mode='auto', save_best_only=True, verbose=1)
lr_scheduler = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=5, verbose=1, min_lr=1e-6)

# Train the model.
# Step counts come from the generators themselves (len(gen) is the number of
# batches, including the final partial one). Flooring len(X) // 32 would skip
# the trailing partial batch, so with shuffle=False the same validation samples
# would never contribute to val_accuracy.
history = model.fit(
    train_gen,
    epochs=40,
    validation_data=validation_gen,
    callbacks=[early_stopping_monitor, model_checkpoint_callback, lr_scheduler],
    steps_per_epoch=len(train_gen),
    validation_steps=len(validation_gen),
    class_weight=class_weights
)


In [None]:
# Score the trained model on the held-out test split (no augmentation, fixed order)
test_gen = validation_datagen.flow(X_test, y_test, shuffle=False, batch_size=32)
score = model.evaluate(test_gen, verbose=0)

# score[0] is the loss, score[1] the accuracy metric the model was compiled with
for position, label in enumerate(('Test loss:', 'Test accuracy:')):
    print(label, score[position])