In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.image import imread #Read Images

import cv2 as cv
import pathlib
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from PIL import Image
from keras.models import load_model
from keras.layers import Resizing
import tensorflow as tf
from tensorflow.keras.utils import image_dataset_from_directory as tfks


2024-05-06 20:02:39.834723: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 20:02:39.834897: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 20:02:40.035074: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
root_path= '../input/jpeg-isic2019-512x512/'
meta =pd.read_csv(root_path+'train.csv')
#meta.info()   #[DISPLAY]

In [3]:
meta["path_jpeg"]=root_path+'train/'+meta["image_name"]+".jpg" #Add Image Path to the data frame
meta["target"].value_counts(normalize=True) #Non-Melanoma vs Melanoma (Normalized for proportional representation)

target
0    0.821484
1    0.178516
Name: proportion, dtype: float64

In [4]:
X=meta["path_jpeg"]
Y=meta["benign_malignant"]
X_use, X_test, Y_use, Y_test = train_test_split(X, Y, test_size=0.15, random_state=0, stratify=Y) #Split into testing data and data that will be used
#for training/validation. Stratify adds class labels
X_train, X_val, Y_train, Y_val = train_test_split(X_use, Y_use, test_size=0.25, random_state=0, stratify=Y_use)
#Training data and validation set

#Then recombine the training sets into a single pandas Dataframe
training_DF=pd.concat([X_train,Y_train],axis=1) #Axis determines concatenation direction
#training_DF   #[DISPLAY]

In [5]:
#Repeat for the validation set
validation_DF=pd.concat([X_val,Y_val],axis=1)
#and for the testing set
testing_DF=pd.concat([X_test,Y_test],axis=1)

#Test that they have the same distribution as Meta, 2 cells above
print("--Meta--")
meta["target"].value_counts(normalize=True)

--Meta--


target
0    0.821484
1    0.178516
Name: proportion, dtype: float64

In [6]:
#For Training
print("--Training--")
training_DF["benign_malignant"].value_counts(normalize=True)

--Training--


benign_malignant
benign       0.821464
malignant    0.178536
Name: proportion, dtype: float64

In [7]:
#For Testing
print("--Testing--")
testing_DF["benign_malignant"].value_counts(normalize=True)

--Testing--


benign_malignant
benign       0.821579
malignant    0.178421
Name: proportion, dtype: float64

In [8]:
#For Testing
print("--Validation--")
validation_DF["benign_malignant"].value_counts(normalize=True)

--Validation--


benign_malignant
benign       0.821475
malignant    0.178525
Name: proportion, dtype: float64

In [9]:
files = [f for f in os.listdir("/kaggle/input/jpeg-isic2019-512x512/train")]
files.sort()


In [10]:
baseDir="/kaggle/input/jpeg-isic2019-512x512/train/"


def rgb2grey(rgb):
    return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140]) #Convert to greyscale

def getFlattened(n):
    return rgb2grey(imread(baseDir+n)).flatten()[::16].copy() #Every 16 pieces of data (512*512/16 or 128*128)
datamap=map(getFlattened,files)
data=np.fromiter(datamap,dtype=np.dtype((int, 128*128))) #Converted to 128*128
#Add to dataset and flatten
with open('flattenedData.npy', 'wb') as f:
    np.save(f,data)


In [11]:
#Split the labels
def labelSymbolise(x):
    if x=="benign":
        return 0
    return 1

def get_labels(dataf):
    vfunc=np.vectorize(labelSymbolise)
    return vfunc(np.array(dataf["benign_malignant"]))
train_labels = get_labels(training_DF)
val_labels=get_labels(validation_DF)
test_labels=get_labels(testing_DF)
#for i in range(100):
    #print(a[i])

In [12]:
#Split the images
def get_filename(n):
    return baseDir+n
fileList=list(map(get_filename,files))
print(fileList[0])
#Identical
print(fileList[0][len(fileList[0])-11:])

/kaggle/input/jpeg-isic2019-512x512/train/ISIC_0000000.jpg
0000000.jpg


In [13]:
def find(l, s): #Find substrings in list
    for i in range(len(l)):
        if l[i].find(s)!=-1:
            return i
    return None # Or -1

def concat_ToStringID(x):
    search_term=str(x[42:49]) #7-digit Identifier
    index = find(fileList,search_term)
    return index
def get_wantedIndices(dataf):
    vfunc=np.vectorize(concat_ToStringID)
    return vfunc(np.array(dataf["path_jpeg"]))

testing_images = data[get_wantedIndices(testing_DF)]
training_images = data[get_wantedIndices(training_DF)]
validation_images = data[get_wantedIndices(validation_DF)]



In [14]:
#Print how many of each we have
print("Number of Training Images : " +str(training_images.shape[0]))
print("Number of Testing Images : " +str(testing_images.shape[0]))
print("Number of Validation Images : " +str(validation_images.shape[0]))
#Reshape into 128*128
def shapen(inDat):
    print(inDat)
    return np.reshape(inDat,(-1,128,128))
vfunc=np.vectorize(shapen)
testing_images=np.reshape(testing_images,(-1,128,128,1)) #Converts it to 128*128 NPy Array
training_images=np.reshape(training_images,(-1,128,128,1))
validation_images=np.reshape(validation_images,(-1,128,128,1))#Has to have a fourth dimension ('1') for the keras Convolution model
print("Testing Images Shape : "+str(testing_images.shape))
print("Training Images Shape : "+str(training_images.shape))
print("Validation Images Shape : "+str(validation_images.shape))

Number of Training Images : 16148
Number of Testing Images : 3800
Number of Validation Images : 5383
Testing Images Shape : (3800, 128, 128, 1)
Training Images Shape : (16148, 128, 128, 1)
Validation Images Shape : (5383, 128, 128, 1)


In [15]:
#Create the Keras Sequential Model
import keras
from keras import layers
from keras import ops

#Use a sequential layer, Conv2D followed by MaxPooling2D downsamples - repeat and then use dense layer to get an output
model = keras.Sequential(name="Convolution_Model") #Empty Keras Model that we will add to incrementally.


model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128,  1))) #128*128*1 is the input shape [128px *128px * 1 colour channel]
model.add(layers.MaxPooling2D((2, 2))) #Downsamples #Creates an output shape based upon the stride and pool size 
model.add(layers.Conv2D(32, (4, 4), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu')) #In dense layers, every neuron is connected to the neuron in the previous layer
model.add(layers.Dense(1, activation='sigmoid'))
model.summary() # Get the summary of the model

#Layers Documentation used
#https://keras.io/api/layers/pooling_layers/max_pooling2d/
#https://keras.io/api/layers/core_layers/dense/

#Activation Functions
# ReLu -> input>0
# Sigmoid -> 1/(1+e^-x) [Creates an S shape]

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy', 'Recall', 'AUC'])
training_results = model.fit(training_images,train_labels,epochs=30,batch_size=32,validation_data=(validation_images, val_labels)) #Train the model and store the training results


  super().__init__(


Epoch 1/30
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 498ms/step - AUC: 0.5789 - Recall: 0.0631 - accuracy: 0.8071 - loss: 1.7578 - val_AUC: 0.7029 - val_Recall: 0.0968 - val_accuracy: 0.8309 - val_loss: 0.4265
Epoch 2/30
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 485ms/step - AUC: 0.6803 - Recall: 0.1732 - accuracy: 0.8282 - loss: 0.4325 - val_AUC: 0.7122 - val_Recall: 0.1072 - val_accuracy: 0.8313 - val_loss: 0.4287
Epoch 3/30
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 522ms/step - AUC: 0.6990 - Recall: 0.1809 - accuracy: 0.8324 - loss: 0.4231 - val_AUC: 0.7325 - val_Recall: 0.1686 - val_accuracy: 0.8395 - val_loss: 0.4098
Epoch 4/30
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 488ms/step - AUC: 0.6983 - Recall: 0.1885 - accuracy: 0.8325 - loss: 0.4260 - val_AUC: 0.7269 - val_Recall: 0.2664 - val_accuracy: 0.8142 - val_loss: 0.4271
Epoch 5/30
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [16]:
#Evaluate the model against the testing data.
model.evaluate(x=testing_images, y=test_labels)

[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 224ms/step - AUC: 0.6872 - Recall: 0.3587 - accuracy: 0.8305 - loss: 0.6197


[0.6394062638282776,
 0.8276315927505493,
 0.33480826020240784,
 0.6780419945716858]

In [17]:
#Or alternatively, predict the testing data
predictions = model.predict(testing_images)

[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 140ms/step
