In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
from IPython.display import display, Image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout

import datetime
import os
import gc

## Load the tabular data

In [3]:
# Load the training metadata table from the CSV file into a DataFrame.
df_train = pd.read_csv('data/train.csv')

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df_train.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [None]:
# Label column of the metadata table.
targets = df_train['target']

In [None]:
# Build the JPEG path of every training image from its image name.
filenames = [
    'data/jpeg/train/' + image_name + '.jpg'
    for image_name in df_train['image_name']
]

## Preparation of training and validation splits

In [None]:
# Targets are numeric labels, while the images are still just file paths at this point.
X = filenames
# One-hot encode the labels. The dtype is requested explicitly because newer pandas
# versions return boolean dummy columns by default; asking for float32 makes the
# labels reach TensorFlow as numeric tensors regardless of the pandas version.
y = pd.get_dummies(targets, dtype='float32')

In [None]:
# Split the dataset into train and validation sets.
# The split is stratified on the label so that both sets keep the same class
# proportions; with a heavily unbalanced dataset a purely random split can leave
# the validation set with very few (or no) examples of the rare class.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=targets)

## Converting the data into tensors suitable for TensorFlow

In [None]:
# Side length (in pixels) that every image is resized to
IMG_SIZE = 256
def process_image(img_path):
  """
  Reads the JPEG file at `img_path` and returns it as a float32 image tensor.

  The file is decoded with 3 colour channels, converted to float32 and
  resized to IMG_SIZE x IMG_SIZE.
  """
  raw_bytes = tf.io.read_file(img_path)
  pixels = tf.image.convert_image_dtype(
      tf.image.decode_jpeg(raw_bytes, channels=3), tf.float32)
  return tf.image.resize(pixels, size=[IMG_SIZE, IMG_SIZE])

In [None]:
def create_df(image, target):
  """
  Pairs a decoded image tensor with its target.

  Args:
    image: path of the image file; it is passed on to process_image.
    target: label belonging to that image; returned unchanged.

  Returns:
    A tuple (image_tensor, target).
  """
  # The body must sit at the same indentation level as the docstring; the deeper
  # indent used before made the whole cell fail with an IndentationError.
  image = process_image(image)
  return image, target

In [None]:
# Number of images to be loaded in each batch
BATCH_SIZE = 32

# Create training and validation data batches.
# The training set is shuffled element-wise *before* decoding and batching:
# shuffling after .batch() only reorders whole batches (their contents never change
# between epochs) and forces the shuffle buffer to hold decoded images instead of
# lightweight (path, label) pairs.
train_data = tf.data.Dataset.from_tensor_slices((tf.constant(x_train), tf.constant(y_train)))
train_data = train_data.shuffle(buffer_size=len(x_train)).map(create_df).batch(BATCH_SIZE)

# The validation set keeps its original order; it is only decoded and batched.
val_data = tf.data.Dataset.from_tensor_slices((tf.constant(x_val), tf.constant(y_val)))
val_data = val_data.map(create_df).batch(BATCH_SIZE)

## Calculating the weights for the unbalanced data

In [None]:
# The dataset is heavily unbalanced, so each class gets a weight inversely
# proportional to its frequency: weight = total / (2 * class_count).
# Note: the counts are taken over the whole of df_train, not only the training split.

Total_num_img = len(df_train)
# Rows with a non-zero target are counted as malignant, the remainder as benign.
Num_mlg = np.count_nonzero(df_train.target)
Num_bng = Total_num_img - Num_mlg

Weight_bng = 1/Num_bng * Total_num_img/2
Weight_mlg = 1/Num_mlg * Total_num_img/2

# Mapping from class index (0 = benign, 1 = malignant) to its weight.
class_weights = {0: Weight_bng, 1: Weight_mlg}

## Creating and training a model

In [None]:
# In case a previously trained model needs to be loaded
def load_model(model_path):
  """
  Restores a previously saved Keras model.

  Args:
    model_path: location of the saved model.

  Returns:
    The model returned by tf.keras.models.load_model.
  """
  print(f"Loading saved model from: {model_path}")
  return tf.keras.models.load_model(model_path)

In [None]:
# Size of the input, hidden layer and output of the neural network
# INPUT_SIZE is [batch, height, width, channels]; the batch dimension is left as None.
INPUT_SIZE = [None, IMG_SIZE,IMG_SIZE, 3]
# Number of units in the fully connected hidden layer.
HIDDEN_SIZE = 128
# Number of units in the output layer.
OUTPUT_SIZE = 2

In [None]:
def create_model(retrain = False):
    """
    Builds a new CNN classifier or reloads the most recently saved one.

    Args:
        retrain: when True, load the newest file found in the 'models'
            directory instead of building a fresh network.

    Returns:
        A Keras model. A freshly built model is compiled before it is
        returned; a reloaded model is returned as loaded. In both cases the
        model summary is printed.

    Raises:
        FileNotFoundError: if retrain is True and the 'models' directory is
            missing or holds no saved model.
    """
    if retrain:
        model_dir = 'models'
        if os.path.isdir(model_dir):
            paths = [os.path.join(model_dir, basename) for basename in os.listdir(model_dir)]
        else:
            paths = []
        # Fail with a clear message instead of the opaque ValueError that max()
        # raises on an empty sequence.
        if not paths:
            raise FileNotFoundError(
                f"No saved model found in '{model_dir}' to continue training from")
        # The most recently created file is taken to be the latest model.
        model_path = max(paths, key=os.path.getctime)
        model = load_model(model_path)
    else:
        model = Sequential([
                    Conv2D(64, kernel_size = 5, padding = 'VALID', activation = 'relu'),
                    MaxPooling2D(pool_size = 5),

                    Conv2D(128, kernel_size = 5, padding = 'VALID', activation = 'relu'),
                    MaxPooling2D(pool_size = 5),

                    Conv2D(256, kernel_size = 5, padding = 'VALID', activation = 'relu'),
                    MaxPooling2D(pool_size = 5),

                    Flatten(),
                    Dense(HIDDEN_SIZE,activation='relu'),
                    Dense(OUTPUT_SIZE, activation='softmax')])

        model.build(INPUT_SIZE)

        model.compile(
                    # `learning_rate` is the supported argument name; `lr` is a
                    # deprecated alias that newer Keras versions reject.
                    optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-2),
                    loss = tf.keras.losses.BinaryCrossentropy(),
                    # 'val_AUC' is not a metric identifier. Keras adds the 'val_'
                    # prefix itself for validation data, so a metric named 'AUC'
                    # is reported as 'AUC' and 'val_AUC'.
                    metrics = [tf.keras.metrics.AUC(name = 'AUC')])

    model.summary()
    return model

In [None]:
# Build a fresh model rather than reloading a saved one.
model = create_model(retrain=False)

In [None]:
# Creating callbacks
# Stop once the validation loss has not improved for 4 epochs and roll the model back
# to the weights of its best epoch, so that what gets saved afterwards is not the
# (worse) state reached during the patience window.
EarlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=4, restore_best_weights=True)
# Multiply the learning rate by 0.1 after 2 epochs without improvement, down to 1e-6.
Rate = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=2, verbose=0,
    mode='auto', cooldown=0, min_lr=1e-6)

In [None]:
# Number of passes over the training data requested from model.fit
NUM_EPOCHS = 100
# Release unreferenced objects before the training run starts
gc.collect()

In [None]:
# Fit the model on the training batches, validating after every epoch
history = model.fit(
    train_data,
    validation_data=val_data,
    validation_freq=1,
    epochs=NUM_EPOCHS,
    class_weight=class_weights,
    callbacks=[EarlyStopping, Rate],
)

In [None]:
# Plot the validation and training loss recorded for each epoch.
# Each line carries its own label so the legend cannot drift out of sync with the
# order of the plot calls, and the axes are labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(history.history['val_loss'], color='red', label='Validation')
ax.plot(history.history['loss'], color='blue', label='Training')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss')
ax.legend()
plt.show()

## Save the results

In [None]:
def save_model(model, suffix=None):
  """
  Saves a given model in the models directory under a timestamped file name.

  Args:
    model: object exposing a save(path) method, e.g. a Keras model.
    suffix: optional string appended to the timestamp for clarity and reuse.
      When None, the file name consists of the timestamp alone.

  Returns:
    The path the model was saved to.
  """
  model_root = "models"
  # Make sure the target directory exists before anything is written into it.
  os.makedirs(model_root, exist_ok=True)

  # File name: current time, optionally followed by "-<suffix>"
  timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  # The default suffix=None used to crash on string concatenation; skip it instead.
  file_stem = timestamp if suffix is None else f"{timestamp}-{suffix}"
  model_path = os.path.join(model_root, file_stem + ".h5") # save format of model

  print(f"Saving model to: {model_path}...")
  model.save(model_path)
  return model_path

In [None]:
# Label appended to the saved model's file name.
sfx = "Images_CNN"
# Save the trained model; the returned path is shown as the cell output.
save_model(model, suffix=sfx)