# Train DECiSION incl. data augmentation

## Set seeds and import packages

In [1]:
RANDOM_STATE = 42
from numpy.random import seed
seed(RANDOM_STATE)

from tensorflow import set_random_seed
set_random_seed(RANDOM_STATE)

import random
# Seed Python's built-in RNG by CALLING random.seed(). Assigning to the attribute
# (`random.seed = RANDOM_STATE`) would replace the function with an integer, leave the
# generator unseeded and break any later call to random.seed().
random.seed(RANDOM_STATE)

import DECiSION_settings as settings

from dltoolkit.utils.generic import model_architecture_to_file, model_summary_to_file, list_images
from dltoolkit.utils.image import normalise, normalise_single, standardise, standardise_single,\
    mean_subtraction, clahe_equalization, clahe_equalization_single, adjust_gamma_single
from dltoolkit.nn.segment import UNet_NN
from dltoolkit.utils.visual import plot_training_history
from dltoolkit.iomisc import HDF5Writer, HDF5Reader, HDF5Generator_Segment

from thesis_common import convert_img_to_pred, convert_pred_to_img,\
    group_images, show_image, read_preprocess_image, read_preprocess_groundtruth
from thesis_metric_loss import dice_coef, weighted_pixelwise_crossentropy_loss

from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split

import numpy as np
import os, cv2, time, progressbar

import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Convert to HDF5

In [2]:
def create_hdf5_db(imgs_list, dn_name, img_path, img_shape, key, ext, settings, is_mask=False):
    """
    Create a HDF5 file using a list of paths to individual images to be written to the data set.

    Every image is read as grayscale and cropped by `settings.IMG_CROP_HEIGHT` rows at the top and bottom and
    by `settings.IMG_CROP_WIDTH` columns at the left and right. Ground truth masks are then binarised with
    `settings.MASK_BINARY_THRESHOLD`; images are CLAHE-equalised and scaled to the range [-0.5, 0.5].
    :param imgs_list: list of image paths
    :param dn_name: becomes part of the HDF5 file name ("_masks" or "_imgs" is appended to it)
    :param img_path: path to the location of the `images` and `groundtruths` subfolders; the HDF5 file is
    written to `os.path.dirname(img_path)`
    :param img_shape: shape (height, width, channels) of the images being written to the data set
    :param key: key to use for the data set
    :param ext: extension of the HDF5 file name
    :param settings: holds settings
    :param is_mask: True if masks are being written, False if not
    :return: the full path to the HDF5 file
    :raises IOError: if one of the images cannot be read by OpenCV
    """
    # Construct the name of the database
    tmp_name = dn_name + ("_masks" if is_mask else "_imgs")
    output_path = os.path.join(os.path.dirname(img_path), tmp_name) + ext
    print(output_path)

    # Prepare the HDF5 writer, which expects a label vector. Because this is a segmentation problem just pass None.
    # tuple() allows img_shape to be given as a list as well as a tuple
    hdf5_writer = HDF5Writer(((len(imgs_list),) + tuple(img_shape)),
                             output_path=output_path,
                             feat_key=key,
                             label_key=None,
                             del_existing=True,
                             buf_size=len(imgs_list),
                             dtype_feat=np.float32 if not is_mask else np.uint8
                             )
    # Prepare for CLAHE histogram equalization
    clahe = cv2.createCLAHE(clipLimit=2, tileGridSize=(16, 16))

    # Loop through all images
    widgets = ["Creating HDF5 database ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
    pbar = progressbar.ProgressBar(maxval=len(imgs_list), widgets=widgets).start()
    try:
        for i, img in enumerate(imgs_list):
            image = cv2.imread(img, cv2.IMREAD_GRAYSCALE)

            # cv2.imread() does not raise on a missing/corrupt file, it returns None. Fail with a clear
            # message instead of an obscure AttributeError on the cropping line below
            if image is None:
                raise IOError("Unable to read image: {}".format(img))

            # Crop to the region of interest
            image = image[settings.IMG_CROP_HEIGHT:image.shape[0]-settings.IMG_CROP_HEIGHT,
                          settings.IMG_CROP_WIDTH:image.shape[1]-settings.IMG_CROP_WIDTH]

            # Apply pre-processing
            if is_mask:
                # Apply binary thresholding to ground truth masks
                _, image = cv2.threshold(image, settings.MASK_BINARY_THRESHOLD, settings.MASK_BLOODVESSEL,
                                         cv2.THRESH_BINARY)
            else:
                # Apply preprocessing to images (not to ground truth masks)
                # Apply CLAHE histogram equalization
                image = clahe.apply(image)

                # Normalise between -0.5 and 0.5
                image = (image/255.0-0.5).astype(np.float32)

            # Reshape from (height, width) to (height, width, 1)
            image = image.reshape((img_shape[0], img_shape[1], img_shape[2]))

            hdf5_writer.add([image], None)

            # i is zero-based, so i + 1 images have been processed at this point
            pbar.update(i + 1)

        pbar.finish()
    finally:
        # Always release the HDF5 file handle, also when an image fails to load
        hdf5_writer.close()

    return output_path


def perform_hdf5_conversion(settings):
    """
    Convert the training, validation and test images to HDF5 data sets.

    The training images and ground truths are collected per patient folder (keeping only the slices
    `settings.SLICE_START:settings.SLICE_END` of each patient), split into a training and a validation
    set and written to HDF5 together with the test images.
    :param settings: holds settings
    :return: list of HDF5 paths in this order: training images, training ground truths, validation images,
    validation ground truths, test images
    """
    jpg_ext = ".jpg"

    # Root folders of the training images, training ground truths and test images
    images_root = os.path.join(settings.TRAINING_PATH, settings.FLDR_IMAGES)
    masks_root = os.path.join(settings.TRAINING_PATH, settings.FLDR_GROUND_TRUTH)
    test_root = os.path.join(settings.TEST_PATH, settings.FLDR_IMAGES)

    def _patient_folders(root):
        # Sorted paths of all sub folders (one per patient) directly below `root`
        return sorted(os.path.join(root, entry.name) for entry in os.scandir(root) if entry.is_dir())

    def _selected_slices(folder):
        # Sorted image paths found in `folder`, restricted to the configured slice range
        found = list(list_images(basePath=folder, validExts=jpg_ext))
        found.sort()
        return found[settings.SLICE_START:settings.SLICE_END]

    image_folders = _patient_folders(images_root)
    mask_folders = _patient_folders(masks_root)
    test_imgs = sorted(list(list_images(basePath=test_root, validExts=jpg_ext)))

    # Gather the image and ground truth paths patient by patient
    img_list, msk_list = [], []
    for image_folder, mask_folder in zip(image_folders, mask_folders):
        img_list += _selected_slices(image_folder)
        msk_list += _selected_slices(mask_folder)

    assert len(img_list) == len(msk_list)

    # Split the training set into a training and validation set
    train_img, val_img, train_msk, val_msk = train_test_split(
        img_list, msk_list,
        test_size=settings.TRN_TRAIN_VAL_SPLIT,
        random_state=settings.RANDOM_STATE,
        shuffle=True)

    # One row per HDF5 data set: (image paths, name prefix, source folder, holds masks?).
    # The test images have no ground truths and are not split; the assumption is that only
    # relevant images are placed in the test folder, i.e. no slices are excluded for them
    jobs = [
        (train_img, "train", images_root, False),
        (train_msk, "train", masks_root, True),
        (val_img, "val", images_root, False),
        (val_msk, "val", masks_root, True),
        (test_imgs, "test", test_root, False),
    ]

    output_paths = []
    for paths, prefix, source_folder, holds_masks in jobs:
        hdf5_path = create_hdf5_db(paths, prefix, source_folder,
                                   (settings.IMG_HEIGHT, settings.IMG_WIDTH, settings.IMG_CHANNELS),
                                   key=settings.HDF5_KEY, ext=settings.HDF5_EXT, settings=settings,
                                   is_mask=holds_masks)
        output_paths.append(hdf5_path)

    return output_paths

In [3]:
# Convert image files to HDF5
if settings.IS_DEVELOPMENT:
    print("\n--- Converting images to HDF5")
    hdf5_paths = perform_hdf5_conversion(settings)
else:
    # Skip the HDF5 conversion and reuse the files written by an earlier run. The order must match
    # the list returned by perform_hdf5_conversion(): train images, train masks, val images,
    # val masks, test images.
    # NOTE(review): the original comment here read "During development avoid performing HDF5
    # conversion for every run", yet this branch runs when IS_DEVELOPMENT is falsy - confirm
    # the intended meaning of the flag.
    hdf5_paths = ["../data/MSC8002/training/train_imgs.h5",
                  "../data/MSC8002/training/train_masks.h5",
                  "../data/MSC8002/training/val_imgs.h5",
                  # A comma was missing after the next entry, which silently concatenated it with
                  # the test path and produced a list of four instead of five paths
                  "../data/MSC8002/training/val_masks.h5",
                  "../data/MSC8002/test/test_imgs.h5"
                  ]

Creating HDF5 database 100% |###################################| Time: 0:00:00
Creating HDF5 database 100% |###################################| Time: 0:00:00
Creating HDF5 database 100% |###################################| Time: 0:00:00
Creating HDF5 database 100% |###################################| Time: 0:00:00
Creating HDF5 database  30% |##########                         | ETA:  0:00:00


--- Converting images to HDF5
Check train data: ../data/MSC8002/training/images/patient_1/S572080069.jpg = ../data/MSC8002/training/groundtruths/patient_1/S57208Filter0069.jpg
  Check val data: ../data/MSC8002/training/images/patient_1/S572080067.jpg = ../data/MSC8002/training/groundtruths/patient_1/S57208Filter0067.jpg
Num train: 9, num val: 1
../data/MSC8002/training/train_imgs.h5
../data/MSC8002/training/train_masks.h5
../data/MSC8002/training/val_imgs.h5
../data/MSC8002/training/val_masks.h5
../data/MSC8002/test/test_imgs.h5


Creating HDF5 database 100% |###################################| Time: 0:00:00


## Class distribution

In [4]:
# Per-class weights in the order [background, blood vessel]; they are passed to the
# weighted loss function when the model is compiled
class_weights = [
    settings.CLASS_WEIGHT_BACKGROUND,
    settings.CLASS_WEIGHT_BLOODVESSEL,
]
print("Class distribution: {}".format(class_weights))

Class distribution: [1.0, 10.0]


## Create U-Net model

In [5]:
# Describe the network input (image size and channels) and the number of output classes
unet = UNet_NN(
    img_height=settings.IMG_HEIGHT,
    img_width=settings.IMG_WIDTH,
    img_channels=settings.IMG_CHANNELS,
    num_classes=settings.NUM_CLASSES,
)

# Build the softmax variant of the model. Alternatives that were tried before:
#   unet.build_model_sigmoid()
#   unet.build_model_flatten()
model = unet.build_model_softmax()

## Create paths

In [6]:
# All output files share the same "DECiSION_<model title>" prefix
title_prefix = "DECiSION_" + unet.title

# Saved model, named after the number of epochs
model_path = os.path.join(settings.MODEL_PATH,
                          title_prefix + "_ep{}.model".format(settings.TRN_NUM_EPOCH))

# Text file holding the model summary
summ_path = os.path.join(settings.OUTPUT_PATH, title_prefix + "_model_summary.txt")

# Training log (CSV), named after the number of epochs and the batch size
csv_path = os.path.join(settings.OUTPUT_PATH,
                        title_prefix + "_training_ep{}_bs{}.csv".format(settings.TRN_NUM_EPOCH,
                                                                        settings.TRN_BATCH_SIZE))

## Save/print model architecture information

In [7]:
# Print the architecture to the console, a text file and an image
model.summary()
model_summary_to_file(model, summ_path)

# Build the path with os.path.join(), like the other output paths, so that it is also correct
# when settings.OUTPUT_PATH does not end with a path separator
model_architecture_to_file(unet.model, os.path.join(settings.OUTPUT_PATH, "DECiSION_" + unet.title))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 240, 240, 1)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 240, 240, 32) 320         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 240, 240, 32) 9248        conv2d_1[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 120, 120, 32) 0           conv2d_2[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (

## Compile the model

In [8]:
# Set the optimiser, loss function and metrics
# Adam is created without arguments, i.e. with the library's default hyper-parameters
opt = Adam()
# The Dice coefficient is reported as a metric during training
metrics = [dice_coef]
# The loss is built from the class weights defined in the "Class distribution" cell
loss = weighted_pixelwise_crossentropy_loss(class_weights)

# Compile
model.compile(optimizer=opt, loss=loss, metrics=metrics)

## Prepare data generators

In [9]:
# Arguments shared by the training and the validation reader
reader_kwargs = dict(batch_size=settings.TRN_BATCH_SIZE,
                     num_classes=settings.NUM_CLASSES,
                     converter=convert_img_to_pred,
                     feat_key=settings.HDF5_KEY)

# Training set generator, reading the training images and ground truths.
# Note that no `data_gen_args` are passed to the reader, i.e. no augmentation
# settings are supplied here
rdr_train = HDF5Generator_Segment(hdf5_paths[0], hdf5_paths[1], **reader_kwargs)
gen_train = rdr_train.generator(num_epochs=settings.TRN_NUM_EPOCH)

# Validation set generator, reading the validation images and ground truths
# (does NOT use data augmentation)
rdr_val = HDF5Generator_Segment(hdf5_paths[2], hdf5_paths[3], **reader_kwargs)
gen_val = rdr_val.generator(num_epochs=settings.TRN_NUM_EPOCH)

# Report the data set sizes
print(" num trn samples: {}".format(rdr_train.num_images()))
print(" num val samples: {}".format(rdr_val.num_images()))

# Report the resulting number of batches per epoch
print(" steps_per_epoch: {}".format(rdr_train.num_images()/settings.TRN_BATCH_SIZE))
print("validation_steps: {}".format(rdr_val.num_images()/settings.TRN_BATCH_SIZE))

 num trn samples: 9
 num val samples: 1
 steps_per_epoch: 9.0
validation_steps: 1.0


## Train WITH a validation set

In [None]:
# Prepare callbacks: keep the model with the lowest validation loss, stop early when the
# validation loss no longer improves and log every epoch to a CSV file
callbacks = [ModelCheckpoint(model_path, monitor="val_loss", mode="min", save_best_only=True, verbose=1),
             EarlyStopping(monitor='val_loss',
                           min_delta=0,
                           patience=settings.TRN_EARLY_PATIENCE,
                           verbose=0,
                           mode="auto"),
             CSVLogger(csv_path, append=False),
             ]

# Keras expects whole numbers of steps. Round up so that a final, partially filled batch
# is still included when the number of images is not a multiple of the batch size
steps_per_epoch = int(np.ceil(rdr_train.num_images() / settings.TRN_BATCH_SIZE))
validation_steps = int(np.ceil(rdr_val.num_images() / settings.TRN_BATCH_SIZE))

# Fit the model using generators and a validation set
# NOTE(review): `gen_train` is also used by the "Train WITHOUT a validation set" cell and was
# created for TRN_NUM_EPOCH epochs only - presumably only one of the two training cells
# should be run per kernel session; confirm.
start_time = time.time()
hist = model.fit_generator(gen_train,
                 epochs=settings.TRN_NUM_EPOCH,
                 steps_per_epoch=steps_per_epoch,
                 verbose=2,
                 validation_data=gen_val,
                 validation_steps=validation_steps,
                 shuffle=True,
                 callbacks=callbacks)

# Convert the elapsed seconds to whole minutes (the division has to happen inside int())
print("\n\nElapsed training time: {} min".format(int((time.time() - start_time) / 60)))

## Train WITHOUT a validation set

In [10]:
# Prepare callbacks: keep the model with the lowest training loss, stop early when the
# training loss no longer improves and log every epoch to a CSV file
callbacks = [
    ModelCheckpoint(model_path, monitor="loss", mode="min", save_best_only=True, verbose=1),
    EarlyStopping(monitor='loss', min_delta=0, patience=settings.TRN_EARLY_PATIENCE, verbose=0, mode="auto"),
    CSVLogger(csv_path, append=False),
    ]

# Keras expects a whole number of steps. Round up so that a final, partially filled batch
# is still included when the number of images is not a multiple of the batch size
steps_per_epoch = int(np.ceil(rdr_train.num_images() / settings.TRN_BATCH_SIZE))

# Fit the model using a training set only
# NOTE(review): `gen_train` is also used by the "Train WITH a validation set" cell and was
# created for TRN_NUM_EPOCH epochs only - presumably only one of the two training cells
# should be run per kernel session; confirm.
start_time = time.time()
hist = model.fit_generator(gen_train,
                 epochs=settings.TRN_NUM_EPOCH,
                 steps_per_epoch=steps_per_epoch,
                 verbose=2,
                 shuffle=True,
                 callbacks=callbacks)

# Convert the elapsed seconds to whole minutes (the division has to happen inside int())
print("\n\nElapsed training time: {} min".format(int((time.time() - start_time) / 60)))

Epoch 1/100
 - 7s - loss: 33376.1421 - dice_coef: 0.6436

Epoch 00001: loss improved from inf to 33376.14214, saving model to ../savedmodels/DECiSION_UNet_brain_softmax_ep100.model
Epoch 2/100
 - 6s - loss: 16312.0703 - dice_coef: 0.9033

Epoch 00002: loss improved from 33376.14214 to 16312.07031, saving model to ../savedmodels/DECiSION_UNet_brain_softmax_ep100.model
Epoch 3/100
 - 6s - loss: 14171.6432 - dice_coef: 0.9211

Epoch 00003: loss improved from 16312.07031 to 14171.64323, saving model to ../savedmodels/DECiSION_UNet_brain_softmax_ep100.model
Epoch 4/100
 - 6s - loss: 12697.4178 - dice_coef: 0.9364

Epoch 00004: loss improved from 14171.64323 to 12697.41775, saving model to ../savedmodels/DECiSION_UNet_brain_softmax_ep100.model
Epoch 5/100
 - 7s - loss: 11241.8515 - dice_coef: 0.9412

Epoch 00005: loss improved from 12697.41775 to 11241.85145, saving model to ../savedmodels/DECiSION_UNet_brain_softmax_ep100.model
Epoch 6/100
 - 6s - loss: 9668.0025 - dice_coef: 0.9487

Epoch 

## Plot the results

In [None]:
# Plot the training history and save it (without showing it) under the output folder.
# The save path is built with os.path.join(), like the other output paths, so that it is
# also correct when settings.OUTPUT_PATH does not end with a path separator
plot_training_history(hist,
                      settings.TRN_NUM_EPOCH,
                      show=False,
                      save_path=os.path.join(settings.OUTPUT_PATH, unet.title),
                      time_stamp=True,
                      metric="dice_coef")

## Perform pipeline test

In [None]:
# Load the training images and ground truths back from their HDF5 files
train_imgs = read_preprocess_image(hdf5_paths[0], settings.HDF5_KEY)
train_grndtr = read_preprocess_groundtruth(hdf5_paths[1], settings.HDF5_KEY)

# For pipeline testing only: predict on the first training image. Indexing with [[0]]
# keeps the leading batch dimension
predictions = model.predict(train_imgs[[0]], batch_size=settings.TRN_BATCH_SIZE, verbose=2)

# Convert the raw model output to an image using the prediction threshold
predictions = convert_pred_to_img(predictions, settings, settings.TRN_PRED_THRESHOLD)

# Show the input image, its ground truth and the predicted mask
for picture, caption in ((train_imgs[0], 'PRED TRAIN org image'),
                         (train_grndtr[0], 'PRED TRAIN org ground truth'),
                         (predictions[0], 'PRED TRAIN predicted mask')):
    show_image(np.squeeze(picture), caption)

# Print the maximum value and data type of each of the three arrays
for template, picture in (("  original {} dtype {}", train_imgs[0]),
                          ("  gr truth {} dtype {}", train_grndtr[0]),
                          ("prediction {} dtype {}", predictions[0])):
    print(template.format(np.max(picture), picture.dtype))