# Train VOLVuLuS incl. data augmentation
Train a 3D U-Net model.

## Set seeds and import packages

In [1]:
# Single seed value shared by every random number generator seeded in this cell.
# The NumPy and TensorFlow generators are seeded here, ahead of the remaining imports.
RANDOM_STATE = 42
from numpy.random import seed
seed(RANDOM_STATE)

from tensorflow import set_random_seed
set_random_seed(RANDOM_STATE)

import random
# Seed Python's built-in generator by CALLING random.seed(). Assigning to
# random.seed would replace the function with an int and leave the generator
# unseeded.
random.seed(RANDOM_STATE)

import VOLVuLuS_settings as settings

from dltoolkit.utils.generic import model_architecture_to_file, model_summary_to_file, list_images
from dltoolkit.nn.segment import UNet_3D_NN
from dltoolkit.utils.visual import plot_training_history, plot_roc_curve, plot_precision_recall_curve,\
    print_confusion_matrix, print_classification_report
from dltoolkit.iomisc import HDF5Generator_Segment

from thesis_common import convert_img_to_pred_3D, convert_pred_to_img_3D, create_hdf5_db_3D,\
    show_image, read_images, read_groundtruths, print_training_info
from thesis_metric_loss import dice_coef, weighted_pixelwise_crossentropy_loss

from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split

import numpy as np
import os, cv2, time, progressbar

import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Change how TensorFlow allocates GPU memory

In [2]:
import tensorflow as tf
from keras import backend as k

# Allocate GPU memory as needed rather than pre-allocating it.
# To cap the share of GPU memory instead, per_process_gpu_memory_fraction
# (e.g. 0.5) can be set on the GPU options.
gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(gpu_options=gpu_options)

# Hand Keras a session created with the options above
k.tensorflow_backend.set_session(tf.Session(config=config))

## Convert training set to HDF5

In [3]:
def perform_hdf5_conversion_3D(settings):
    """Create HDF5 databases for the training images, training ground truths and test images.

    For each of the three data sets `create_hdf5_db_3D` is called on the matching folder
    (".jpg" files only). The assumption is that all images of a data set are placed in the
    same folder, regardless of the patient.

    :param settings: settings object providing the TRAINING_PATH, TEST_PATH, FLDR_IMAGES,
        FLDR_GROUND_TRUTH, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS, HDF5_KEY and HDF5_EXT attributes
    :return: list holding the value returned by `create_hdf5_db_3D` for the training images,
        the training ground truths and the test images, in that order
    """
    img_shape = (settings.IMG_HEIGHT, settings.IMG_WIDTH, settings.IMG_CHANNELS)

    # One entry per data set: (progress label, root folder, sub folder, extra keyword arguments).
    # Only the ground truths are flagged as masks; the other calls rely on the callee's default.
    conversions = (
        ("training images", settings.TRAINING_PATH, settings.FLDR_IMAGES, {}),
        ("training ground truths", settings.TRAINING_PATH, settings.FLDR_GROUND_TRUTH, {"is_mask": True}),
        ("test images", settings.TEST_PATH, settings.FLDR_IMAGES, {}),
    )

    output_paths = []
    for label, root_path, sub_folder, extra_kwargs in conversions:
        print(label)
        hdf5_result = create_hdf5_db_3D(os.path.join(root_path, sub_folder),
                                        img_shape,
                                        img_exts=".jpg", key=settings.HDF5_KEY, ext=settings.HDF5_EXT,
                                        settings=settings, **extra_kwargs)
        output_paths.append(hdf5_result)

    return output_paths

## Enable/disable cross-validation

In [4]:
# When False, the notebook performs a single training run on the training set and plots its
# history; when True, those cells skip that work and only print a cross-validation notice.
USE_KFOLD_CV = False

In [5]:
# Determine the HDF5 files to work with: convert the image files or use fixed paths
if not settings.IS_DEVELOPMENT:
    # No conversion is performed; fixed HDF5 paths are used instead.
    # NOTE(review): the original comment here read "During development avoid performing HDF5
    # conversion for every run", yet this branch runs when IS_DEVELOPMENT is falsy - confirm
    # which meaning of the flag is intended.
    hdf5_paths = [
        "../data/MSC8002/training_3d/images.h5",
        "../data/MSC8002/training_3d/groundtruths.h5",
    ]
else:
    print("\n--- Converting images to HDF5")
    hdf5_paths = perform_hdf5_conversion_3D(settings)

    # Report whether a validation split is configured
    if settings.TRN_TRAIN_VAL_SPLIT == 0:
        print("Not creating a validation set")
    else:
        print("Creating a {} training/validation set".format(settings.TRN_TRAIN_VAL_SPLIT))

Creating HDF5 database 100% |###################################| Time: 0:00:00
Creating HDF5 database 100% |###################################| Time: 0:00:00
Creating HDF5 database 100% |###################################| Time: 0:00:00



--- Converting images to HDF5
training images
training ground truths
test images
Not creating a validation set


## Class distribution

In [None]:
# Class weights taken from the settings, ordered [background, blood vessel];
# they are also handed to the weighted loss function later on
class_weights = [
    settings.CLASS_WEIGHT_BACKGROUND,
    settings.CLASS_WEIGHT_BLOODVESSEL,
]
print("Class distribution: {}".format(class_weights))

## Create the 3D U-Net model

In [None]:
# Configure the network; the number of slices is derived as SLICE_END - SLICE_START
unet = UNet_3D_NN(
    img_height=settings.IMG_HEIGHT,
    img_width=settings.IMG_WIDTH,
    num_slices=settings.SLICE_END - settings.SLICE_START,
    img_channels=settings.IMG_CHANNELS,
    num_classes=settings.NUM_CLASSES,
)

# Build the "3lyr" variant; unet.build_model() is the alternative builder
model = unet.build_model_3lyr()

## Create paths

In [None]:
# All output files share the same "VOLVuLuS_<model title>" name prefix
file_prefix = "VOLVuLuS_" + unet.title

# Saved model, model summary and per-epoch training log (CSV)
model_path = os.path.join(settings.MODEL_PATH,
                          file_prefix + "_ep{}.model".format(settings.TRN_NUM_EPOCH))
summ_path = os.path.join(settings.OUTPUT_PATH, file_prefix + "_model_summary.txt")
csv_path = os.path.join(settings.OUTPUT_PATH,
                        file_prefix + "_training_ep{}_bs{}.csv".format(settings.TRN_NUM_EPOCH,
                                                                       settings.TRN_BATCH_SIZE))

## Save/print model architecture information

In [None]:
# Print the model summary and save it to a text file
model.summary()
model_summary_to_file(model, summ_path)

# Build the architecture file name with os.path.join, like the other output paths, so it
# still lands inside OUTPUT_PATH when that setting has no trailing path separator
model_architecture_to_file(unet.model, os.path.join(settings.OUTPUT_PATH, "VOLVuLuS_" + unet.title))

## Compile the model

In [None]:
# Set the optimiser, loss function and metrics
opt = Adam()
metrics = [dice_coef]
loss = weighted_pixelwise_crossentropy_loss(class_weights)

# Compile
model.compile(optimizer=opt, loss=loss, metrics=metrics)

## Load data

### Option 1: Use generators

### Option 2: Load all data into memory

In [None]:
# The first two HDF5 paths hold the training images and the training ground truths
images_h5_path, groundtruths_h5_path = hdf5_paths[0], hdf5_paths[1]

# Read both data sets into memory
train_imgs = read_images(images_h5_path, settings.HDF5_KEY, is_3D=True)
train_grndtr = read_groundtruths(groundtruths_h5_path, settings.HDF5_KEY, is_3D=True)

# Convert the ground truths into the target format used for training
train_grndtr_ext_conv = convert_img_to_pred_3D(train_grndtr, settings.NUM_CLASSES, settings.VERBOSE)

print("Number of samples: {}".format(len(train_imgs)))

## Train the model

### Train WITH a validation set - ALL IN MEMORY

### Train WITHOUT a validation set - ALL IN MEMORY

In [None]:
# One-off training run on the training set only; skipped when cross-validation is selected
if not USE_KFOLD_CV:
    print("Training with just a training set, no validation set used.")
    print_training_info(unet, model_path, train_imgs.shape, None, settings, class_weights, 1, opt, loss)
    print("Training start:\n")

    # Prepare callbacks. There is no validation set, so checkpointing and early stopping
    # both monitor the training loss.
    callbacks = [
        ModelCheckpoint(model_path, monitor="loss", mode="min", save_best_only=True, verbose=1),
        EarlyStopping(monitor='loss', min_delta=0, patience=settings.TRN_EARLY_PATIENCE, verbose=0, mode="auto"),
        CSVLogger(csv_path, append=False),
        ]

    # Fit the model using a training set only
    start_time = time.time()
    hist = model.fit(train_imgs, train_grndtr_ext_conv,
                     epochs=settings.TRN_NUM_EPOCH,
                     batch_size=settings.TRN_BATCH_SIZE,
                     verbose=2,
                     shuffle=True,
                     callbacks=callbacks)

    # True float division: truncating the seconds with int() first loses precision and,
    # under Python 2, turns the division into integer division
    elapsed_min = (time.time() - start_time) / 60.0
    print("\n\nElapsed training time: {:.2f} min".format(elapsed_min))
else:
    print("Cross-validation selected, not performing one-off training with just a training set")

## Plot the results

In [None]:
# Plot the history of the one-off training run; `hist` only exists when that run was performed
if not USE_KFOLD_CV:
    plot_training_history(hist,
                          show=False,
                          # os.path.join, like the other output paths, so the plot still lands
                          # inside OUTPUT_PATH when that setting has no trailing path separator
                          save_path=os.path.join(settings.OUTPUT_PATH, unet.title),
                          time_stamp=True,
                          metric="dice_coef")
else:
    print("Using cross-validation, no training history saved")

## Perform pipeline test

In [None]:
# Read images and ground truths (re-read so this cell does not depend on earlier in-memory copies)
train_imgs = read_images(hdf5_paths[0], settings.HDF5_KEY, is_3D=True)
train_grndtr = read_groundtruths(hdf5_paths[1], settings.HDF5_KEY, is_3D=True)

# For pipeline testing only: predict on the training images themselves
predictions = model.predict(train_imgs, batch_size=settings.TRN_BATCH_SIZE, verbose=2)

# Transpose images and ground truths to the correct order: axis 3 is moved to position 1
# (presumably the slice axis, so that [sample, slice] indexing works below - TODO confirm)
train_imgs = np.transpose(train_imgs, axes=(0, 3, 1, 2, 4))
train_grndtr = np.transpose(train_grndtr, axes=(0, 3, 1, 2, 4))

# Convert the raw predictions to images using the configured threshold
predictions_imgs = convert_pred_to_img_3D(predictions,
                                       threshold=settings.TRN_PRED_THRESHOLD,
                                       verbose=settings.VERBOSE)

# Show the [0, 0] entry of the original image, its ground truth and the predicted mask
show_image(np.squeeze(train_imgs[0, 0]), 'PRED TRAIN org image')
show_image(np.squeeze(train_grndtr[0, 0]), 'PRED TRAIN org ground truth')
show_image(np.squeeze(predictions_imgs[0, 0]), 'PRED TRAIN predicted mask')

# Report maximum value and dtype of each displayed array. The prediction line reports both
# values for the converted image (predictions_imgs), not the dtype of the raw network output.
print("  original {} dtype {}".format(np.max(train_imgs[0,0]), train_imgs[0,0].dtype))
print("  gr truth {} dtype {}".format(np.max(train_grndtr[0,0]), train_grndtr[0,0].dtype))
print("prediction {} dtype {}".format(np.max(predictions_imgs[0,0]), predictions_imgs[0,0].dtype))