In [None]:
# import necessary libraries

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

from sklearn.model_selection import train_test_split

import time

# show which TensorFlow version executed this notebook, so a run can be reproduced
print(tf.__version__)

2.3.0


In [None]:
# Detect the visible GPUs and cap the memory TensorFlow may claim on each of them.
# NOTE: this has to run before the GPUs are initialised (i.e. before any tensor is placed on them).
gpus = tf.config.experimental.list_physical_devices('GPU')
print("Number of GPUs Available: ", len(gpus))
for gpu in gpus:
    # a single virtual device per physical GPU, limited to 7900 MB
    memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=7900)
    tf.config.experimental.set_virtual_device_configuration(gpu, [memory_cap])

Number of GPUs Available:  1


In [None]:
# MirroredStrategy replicates the model on every visible GPU and keeps the copies in sync.
strategy = tf.distribute.MirroredStrategy()
# the number of replicas created by the strategy should equal the number of GPUs available
replica_count = strategy.num_replicas_in_sync
print ('Number of synchronized replicas created: {}'.format(replica_count))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of synchronized replicas created: 1


In [None]:
# Mount Google Drive so that the data and log folders below '/content/drive' can be reached.
# This cell is only meant for runs on Google Colab; skip it when one of the local
# base paths (see the following cells) is used instead.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder that holds the pre-built .npy arrays; exactly one Basepath should be active.

# Basepath for Google DRIVE:
Basepath = '/content/drive/My Drive/Stage_ENT_Studios_2/Data/Kaggle/Arrays_5GB_float32/'

# Basepath for Jupyter notebooks:
# Basepath = 'C:/Users/lunam/Documents/1steMaster/Stage/Data_FinalArrays/Kaggle/Array_10GB/'
# also a version with 5GB and 20GB

# Basepath for KILI
# Basepath = '/home/kili/Desktop/Data_FinalArrays/Kaggle/Arrays/'

# load the complete labelled data set under its own names, so that the raw arrays are not
# silently replaced by the split below
all_images = np.load(Basepath + 'train_images_Final.npy')
all_labels = np.load(Basepath + 'train_labels_Final.npy')

# hold out 33% of the samples for testing; the fixed random_state makes the split reproducible
train_images, test_images, train_labels, test_labels = train_test_split(all_images, all_labels, test_size=0.33, random_state=42)

# the un-split arrays are not used any more, release them to save memory
del all_images, all_labels

# report the shapes AFTER the split, so the numbers describe what is actually used for training and testing
print('Shape train images: {}'.format(train_images.shape))
print('Shape train labels: {}'.format(train_labels.shape))
print('Shape test images: {}'.format(test_images.shape))
print('Shape test labels: {}'.format(test_labels.shape))

Shape train images: (324, 256, 256, 3)
Shape train labels: (324, 2)
Shape test images: (156, 256, 256, 3)
Shape test labels: (156, 2)


In [None]:
# Output locations for the tensorboard logs and for the trained model weights.

# Root folder when running from Google DRIVE:
base_path = '/content/drive/My Drive/Stage_ENT_Studios_2/DR_Grading/Logs/'

# Root folder when running from a local jupyter notebook:
# base_path = 'C:/Users/lunam/Documents/1steMaster/Stage/Code_Final/DR_classification/DeepLearningClassification/ResNet18_GPU/Logs/'

# directory in which the tensorboard files will be stored
log_dir_tens = '{}Tensorboard_Logs/'.format(base_path)
# directory in which the trained models will be stored
log_dir_model = '{}Trained_Model/'.format(base_path)


Some information on the Identity Block:

The input and output of the identity block have the same dimensions.
    
The shortcut path consists of the identity function.
    
The main path consists of 2 convolutional layers, each followed by a batch normalization.
The first convolutional layer is also followed by a ReLU activation function.
    
At the end the shortcut path and main path are brought together and a Relu activation is added. 
    
n_filters = the number of filters used in each convolutional layer of the main path
Dropout can be added to reduce overfitting; its rate is set with dropout_prob (a rate of 0 disables dropout)
    
The function returns an output with the same dimensions as the input




Some information on the Convolutional Block:

Input and output of the convolutional block don't have the same dimensions.
    
Shortcut path consists of a convolutional layer followed by a batch normalization
    
The main path consists of 2 convolutional layers, each followed by a batch normalization.
The first convolutional layer is also followed by a ReLU activation function.
    
At the end the shortcut path and main path are brought together and a Relu activation is added. 
    
Input_Image = the input to this convolutional block.
n_filters = the number of filters used in each convolutional layer of the main path;
the same number of filters is used in the convolutional layer of the shortcut path
Dropout and max pooling can optionally be added to reduce overfitting and the number of parameters (the dropout rate can also be defined)
    
The function returns an output whose width and height are roughly half of the width and height of the input (roughly a quarter when the optional max pooling is enabled)

In [None]:
def Identity_Block(Input_Image, n_filters, dropout_prob = 0):
    '''
    Residual block whose output has the same dimensions as its input.

    The main path is Conv(3x3, 'same') -> BatchNorm -> ReLU -> Conv(3x3, 'same') -> BatchNorm.
    The unchanged input (shortcut path) is added to the result of the main path, after which
    an optional dropout and a final ReLU are applied.

    Input_Image:  channels-last feature map; its number of channels has to equal n_filters,
                  otherwise the addition with the shortcut fails
    n_filters:    number of filters of both convolutional layers of the main path
    dropout_prob: dropout rate applied after the addition, 0 disables dropout

    Returns the output feature map of the block.
    '''
    # initialization of the weights
    W_init = tf.initializers.GlorotUniform()

    # Shortcut path: identity function
    Shortcut_Image = Input_Image

    # Main path; padding 'same' keeps the spatial dimensions unchanged
    Main_Image = tf.keras.layers.Conv2D(n_filters, (3,3), kernel_initializer = W_init, padding = 'same')(Input_Image)
    # the tensors are channels-last (the network input is (256, 256, 3)), so the feature axis
    # to normalise is the last one; axis = 1 would normalise over the image rows instead
    Main_Image = tf.keras.layers.BatchNormalization(axis = -1)(Main_Image)
    Main_Image = tf.nn.relu(Main_Image)

    Main_Image = tf.keras.layers.Conv2D(n_filters, (3,3), kernel_initializer = W_init, padding = 'same')(Main_Image)
    # the second convolution is batch-normalised as well, as in Convolutional_Block
    Main_Image = tf.keras.layers.BatchNormalization(axis = -1)(Main_Image)

    # Output: Relu activation from (Shortcut path + Main path)
    Output_Image = tf.keras.layers.Add()([Main_Image, Shortcut_Image])

    if dropout_prob != 0:
        Output_Image = tf.keras.layers.Dropout(rate = dropout_prob)(Output_Image)

    Output_Image = tf.nn.relu(Output_Image)

    return Output_Image

In [None]:
def Convolutional_Block(Input_Image, n_filters, dropout_prob = 0, pooling = False):
    '''
    Residual block that reduces the spatial size of its input.

    Shortcut path: Conv(3x3, stride 2) -> BatchNorm.
    Main path:     Conv(3x3, stride 2) -> BatchNorm -> ReLU -> Conv(3x3, 'same') -> BatchNorm.
    Both paths are added, optionally max-pooled (2x2), optionally passed through dropout
    and finally passed through a ReLU.

    Input_Image:  channels-last feature map
    n_filters:    number of filters of every convolutional layer in this block
    dropout_prob: dropout rate applied after the addition, 0 disables dropout
    pooling:      when True a 2x2 max pooling is applied after the addition

    Returns the output feature map of the block.
    '''
    # initialization of the weights
    W_init = tf.initializers.GlorotUniform()

    # Shortcut path; the stride leads to a reduction in size
    Shortcut_Image = tf.keras.layers.Conv2D(n_filters, (3,3), strides = (2,2), kernel_initializer = W_init)(Input_Image)
    # the tensors are channels-last (the network input is (256, 256, 3)), so the feature axis
    # to normalise is the last one; axis = 1 would normalise over the image rows instead
    Shortcut_Image = tf.keras.layers.BatchNormalization(axis = -1)(Shortcut_Image)

    # Main path; the first convolution uses the same kernel size and stride as the shortcut,
    # so both paths end up with the same dimensions
    Main_Image = tf.keras.layers.Conv2D(n_filters, (3,3), strides = (2,2), kernel_initializer = W_init)(Input_Image)
    Main_Image = tf.keras.layers.BatchNormalization(axis = -1)(Main_Image)
    Main_Image = tf.nn.relu(Main_Image)

    # padding 'same' is needed to keep the dimensions equal to those of the shortcut path
    Main_Image = tf.keras.layers.Conv2D(n_filters, (3,3), kernel_initializer = W_init, padding = 'same')(Main_Image)
    Main_Image = tf.keras.layers.BatchNormalization(axis = -1)(Main_Image)

    # Output: Relu activation from (Shortcut path + Main path)
    Output_Image = tf.keras.layers.Add()([Main_Image, Shortcut_Image])

    if pooling:
        Output_Image = tf.keras.layers.MaxPool2D((2,2))(Output_Image)

    if dropout_prob != 0:
        Output_Image = tf.keras.layers.Dropout(rate = dropout_prob)(Output_Image)

    Output_Image = tf.nn.relu(Output_Image)

    return Output_Image

In [None]:
def ResNet18(init_filters = 64, drop_prob = 0.1, dense_nodes = 1000, ExtraPooling = False):
    '''
    Build a ResNet18-style classifier for 256x256 RGB images with two output classes.

    The network consists of a 7x7 stem convolution, four stages of residual blocks
    (Identity_Block / Convolutional_Block) in which the number of filters doubles per stage,
    an average pooling, one hidden dense layer and a 2-way softmax output.

    init_filters: number of filters of the stem and of the first stage
    drop_prob:    dropout rate used in the first block of every stage
    dense_nodes:  number of nodes of the hidden dense layer
    ExtraPooling: when True an additional 2x2 max pooling is applied in the
                  convolutional blocks of stage 3 and stage 4

    Returns the (uncompiled) tf.keras.Model.
    '''
    # initialization of the weights
    W_init = tf.initializers.GlorotUniform()

    # Input
    X_Input = tf.keras.layers.Input(shape = (256,256, 3))

    # Convolutional layer 1
    X = tf.keras.layers.ZeroPadding2D((3, 3))(X_Input)
    X = tf.keras.layers.Conv2D(init_filters, (7,7), strides=(2, 2), kernel_initializer= W_init)(X)
    # the input is channels-last, so the channel axis to normalise is the last one;
    # axis = 1 would normalise over the image rows instead
    X = tf.keras.layers.BatchNormalization(axis = -1)(X)
    X = tf.nn.relu(X)

    # Convolutional layer 2
    X = tf.keras.layers.ZeroPadding2D((1, 1))(X)
    X = tf.keras.layers.MaxPool2D((3,3), strides = (2,2))(X)
    X = Identity_Block(X, init_filters, dropout_prob = drop_prob)
    X = Identity_Block(X, init_filters, dropout_prob = 0)

    # Convolutional layer 3
    # the zero padding compensates for the unpadded 3x3 convolutions of the convolutional block
    X = tf.keras.layers.ZeroPadding2D((1, 1))(X)
    X = Convolutional_Block(X, 2* init_filters, dropout_prob = drop_prob, pooling = ExtraPooling)
    X = Identity_Block(X, 2* init_filters, dropout_prob = 0)

    # Convolutional layer 4
    X = tf.keras.layers.ZeroPadding2D((1, 1))(X)
    X = Convolutional_Block(X,  4* init_filters, dropout_prob = drop_prob, pooling = ExtraPooling)
    X = Identity_Block(X, 4* init_filters, dropout_prob = 0)

    # Convolutional layer 5 (never uses the extra pooling)
    X = tf.keras.layers.ZeroPadding2D((1, 1))(X)
    X = Convolutional_Block(X, 8* init_filters, dropout_prob = drop_prob)
    X = Identity_Block(X, 8* init_filters, dropout_prob = 0)

    # Output layer
    X = tf.keras.layers.AveragePooling2D((2,2))(X)
    X = tf.keras.layers.Flatten()(X)
    X = tf.keras.layers.Dense(dense_nodes, activation= tf.nn.relu)(X)
    # softmax over the two classes; the order of the two outputs follows the columns of the labels
    X_Output = tf.keras.layers.Dense(2, activation= tf.nn.softmax)(X)

    # define the model
    model = tf.keras.Model(inputs = X_Input, outputs = X_Output)

    return model

In [None]:
# loss function
def Bin_CrossEntropy_Loss(pred_labels, true_labels, GlobalBatchSize):
    '''
    Binary cross-entropy on the first label column, scaled for distributed training.

    pred_labels:     predicted probabilities, one row per sample; only column 0 is used
    true_labels:     true labels with the same layout; only column 0 is used
    GlobalBatchSize: batch size summed over all replicas

    Returns a scalar: the SUM of the per-sample losses of this replica divided by the global
    batch size. Summing this value over all replicas therefore gives the mean loss per sample.
    (Taking the mean on the replica first and then dividing by the global batch size would
    normalise the loss twice.)
    '''
    # keep a trailing axis of length 1, so the loss object returns one value per sample
    true_labels_pos = true_labels[:, 0:1]
    pred_labels_pos = pred_labels[:, 0:1]

    # no reduction: the averaging over the global batch is done explicitly below
    loss_object = tf.keras.losses.BinaryCrossentropy(reduction= tf.keras.losses.Reduction.NONE)
    per_sample_loss = loss_object(true_labels_pos, pred_labels_pos)

    return tf.nn.compute_average_loss(per_sample_loss, global_batch_size = GlobalBatchSize)

In [None]:
def train_network(TrainImages, TrainLabels, TestImages, TestLabels,
                  Drop_Prob = 0.1, Init_Filters = 64, Dense_Nodes = 1000, extra_pooling = False, batch_size = 3, loss_function = 'BinCrossEntr', optim = 'Adam',
                  learning_rate = 1e-5, MAX_EPOCH = 10, SaveResults = True, print_freq = 1):
    '''
    Train the ResNet18 classifier on the given training data and evaluate it on the test data.

    The model, optimizer and metrics are created inside the scope of the global `strategy`,
    so training is distributed over all replicas of that strategy.

    TrainImages, TrainLabels: training images and their labels (column 0 of the labels is used
                              for the loss and for the AUC metrics)
    TestImages, TestLabels:   held-out images and labels used for validation
    Drop_Prob, Init_Filters, Dense_Nodes, extra_pooling: forwarded to ResNet18
    batch_size:    batch size per replica; the global batch size is batch_size * number of replicas
    loss_function: only 'BinCrossEntr' is supported
    optim:         'Adam' or 'sgd'
    learning_rate: initial learning rate (a number or a tf.Variable); it is multiplied by 0.2
                   after every quarter of the total number of training steps
    MAX_EPOCH:     number of training epochs
    SaveResults:   when True, tensorboard summaries are written to the global `log_dir_tens` and
                   the model weights are saved to the global `log_dir_model` halfway through and
                   at the end of training
    print_freq:    results are printed (and validation is run) every print_freq epochs, and
                   always for the first and the last epoch

    Raises ValueError for an unsupported loss_function or optim.
    The function returns nothing; results are printed and, if requested, saved.
    '''
    # fail early with a clear message instead of an unbound-variable error inside the first train step
    if loss_function != 'BinCrossEntr':
        raise ValueError("Unsupported loss_function '{}': only 'BinCrossEntr' is available".format(loss_function))
    if optim not in ('Adam', 'sgd'):
        raise ValueError("Unsupported optim '{}': choose 'Adam' or 'sgd'".format(optim))

    # setting up saver for the tensorboard logs
    if SaveResults:
        # creating summary which stores the results that can be visualised with tensorboard
        print("Setting up summary writer for tensorboard...")
        summary_writer = tf.summary.create_file_writer(log_dir_tens)

    # define the train and test batches that can be fed into the network
    # global batch size defines the batch size over all available GPU's
    print('Creating distributed data')
    Global_batch_size = batch_size * strategy.num_replicas_in_sync
    train_batch_data = tf.data.Dataset.from_tensor_slices((TrainImages, TrainLabels)).shuffle(TrainImages.shape[0]).batch(Global_batch_size)
    test_batch_data = tf.data.Dataset.from_tensor_slices((TestImages, TestLabels)).batch(Global_batch_size)

    # distribute the data over the different GPU's
    train_dist_data = strategy.experimental_distribute_dataset(train_batch_data)
    test_dist_data = strategy.experimental_distribute_dataset(test_batch_data)

    # define the model that will be used for training and for testing
    # the model, optimisation and loss have to be distributed among GPU's
    tf.compat.v1.reset_default_graph()
    with strategy.scope():

        # model
        print('Defining the model')
        model = ResNet18(init_filters = Init_Filters, drop_prob = Drop_Prob, dense_nodes= Dense_Nodes, ExtraPooling= extra_pooling)

        # loss (loss_function was validated above, only the binary cross-entropy exists)
        print('Defining loss')
        def compute_loss(PredictedLabels, TrueLabels):
            return Bin_CrossEntropy_Loss(PredictedLabels, TrueLabels, Global_batch_size)

        # optimization
        # a decaying learning rate is used: it is multiplied by 0.2 after every quarter of the training steps
        steps_per_epoch = int(TrainImages.shape[0]/Global_batch_size)
        decay_steps = MAX_EPOCH*steps_per_epoch*0.25
        if decay_steps <= 0:
            # fewer samples than one global batch: avoid a division by zero inside the schedule
            decay_steps = 1
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate= learning_rate, decay_steps= decay_steps, decay_rate=0.2, staircase = True)
        print('Defining optimization')
        if optim == 'Adam':
            train_op = tf.keras.optimizers.Adam(learning_rate = lr_schedule)
        else:
            train_op = tf.keras.optimizers.SGD(learning_rate = lr_schedule)

        # defining the metrics
        print('Defining the metrics')
        train_roc_auc = tf.keras.metrics.AUC(curve = 'ROC')
        train_pr_auc = tf.keras.metrics.AUC(curve = 'PR')
        test_roc_auc = tf.keras.metrics.AUC(curve = 'ROC')
        test_pr_auc = tf.keras.metrics.AUC(curve = 'PR')

    # one train step is a step in which one batch of data is fed to every GPU
    def Train_Step(batch):
        train_images_batch, train_labels_batch = batch

        with tf.GradientTape() as tape:
            # make prediction with model
            pred_train_labels = model(train_images_batch, training = True)
            # compute loss
            train_err = compute_loss(pred_train_labels, train_labels_batch)

        # update model
        train_weights = model.trainable_variables
        gradients = tape.gradient(train_err, train_weights)
        train_op.apply_gradients(zip(gradients, train_weights))

        # accumulate the statistics for the auc and aupr score of this epoch
        train_roc_auc.update_state(train_labels_batch[:,0], pred_train_labels[:,0])
        train_pr_auc.update_state(train_labels_batch[:,0], pred_train_labels[:,0])

        # the error per replica is returned
        return train_err

    # in a test step one batch of test data is fed to every GPU
    def Test_Step(batch):
        test_images_batch, test_labels_batch = batch

        # make prediction with model
        pred_test_labels = model(test_images_batch, training = False)
        # compute loss
        test_err = compute_loss(pred_test_labels, test_labels_batch)

        # accumulate the statistics for the auc and aupr score of this evaluation
        test_roc_auc.update_state(test_labels_batch[:,0], pred_test_labels[:,0])
        test_pr_auc.update_state(test_labels_batch[:,0], pred_test_labels[:,0])

        # the error per replica is returned
        return test_err

    @tf.function
    def distributed_train_step(dataset_inputs):
        per_replica_losses = strategy.run(Train_Step, args=(dataset_inputs,))
        # the per-replica losses are already divided by the global batch size, so they are summed
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

    @tf.function
    def distributed_test_step(dataset_inputs):
        per_replica_losses = strategy.run(Test_Step, args=(dataset_inputs,))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

    # the train and test steps now have to be performed with the distributed strategy
    print('Training')
    for epo in range(1,MAX_EPOCH+1):
        start_time = time.time()

        # start every epoch with empty metrics, otherwise the scores accumulate over all previous epochs
        train_roc_auc.reset_states()
        train_pr_auc.reset_states()

        n_train_steps = 0
        total_train_loss = 0.0
        # go over all global batches
        for train_input_data in train_dist_data:
            n_train_steps += 1
            total_train_loss += distributed_train_step(train_input_data)

        # every print frequency the train and test results are printed out
        if epo % print_freq == 0 or epo == 1 or epo == (MAX_EPOCH):

            # calculate the final training results for this epoch:
            # the loss is averaged over the steps, the auc scores are computed over all samples of the epoch
            total_train_loss = float(total_train_loss)/max(n_train_steps, 1)
            total_train_auc = float(train_roc_auc.result())
            total_train_aupr = float(train_pr_auc.result())

            # print out the train results
            print('epoch {} took {}s'.format(epo, time.time() - start_time))
            print('   train loss: {}'.format(total_train_loss))
            print('   train auc: {}'.format(total_train_auc))
            print('   train aupr: {}'.format(total_train_aupr))

            if SaveResults:
                # save these values to visualize them later with tensorboard
                with summary_writer.as_default():
                    tf.summary.scalar('train_loss', total_train_loss, step = epo)
                    tf.summary.scalar('train_roc_auc', total_train_auc, step = epo)
                    tf.summary.scalar('train_pr_auc', total_train_aupr, step = epo)

            # some testing has to be done at these print frequencies
            print('Testing')
            test_roc_auc.reset_states()
            test_pr_auc.reset_states()

            n_test_steps = 0
            total_test_loss = 0.0

            for test_input_data in test_dist_data:
                n_test_steps += 1
                total_test_loss += distributed_test_step(test_input_data)

            total_test_loss = float(total_test_loss)/max(n_test_steps, 1)
            total_test_auc = float(test_roc_auc.result())
            total_test_aupr = float(test_pr_auc.result())

            # print out the test results
            print('   validation loss: {}'.format(total_test_loss))
            print('   validation auc: {}'.format(total_test_auc))
            print('   validation aupr: {}'.format(total_test_aupr))

            if SaveResults:
                with summary_writer.as_default():
                    tf.summary.scalar('validation_loss', total_test_loss, step = epo)
                    tf.summary.scalar('validation_roc_auc', total_test_auc, step = epo)
                    tf.summary.scalar('validation_pr_auc', total_test_aupr, step = epo)
                    summary_writer.flush()

        if SaveResults:
            # storing the model weights at two time-points
            if epo == int(MAX_EPOCH/2):
                print('Saving the intermediate model weights...')
                model.save_weights(log_dir_model + 'ResNet18_' + str(epo) +'_epochs')
                print('Done')

            if epo == MAX_EPOCH:
                print('Saving the model weights...')
                model.save_weights(log_dir_model + 'ResNet18_' + str(epo) +'_epochs')
                print('Done')

In [None]:
# Start the training run; every argument that is not listed keeps the default of train_network
# (Init_Filters = 64, loss_function = 'BinCrossEntr', optim = 'Adam', SaveResults = True, print_freq = 1).
train_network(
    train_images,
    train_labels,
    test_images,
    test_labels,
    MAX_EPOCH = 100,
    learning_rate = tf.Variable(1e-5, dtype=tf.float32),
    extra_pooling = True,
    Drop_Prob = 0.5,
    Dense_Nodes = 100,
    batch_size = 3,
)

Setting up summary writer for tensorboard...
Creating distributed data
Defining the model
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0

KeyboardInterrupt: ignored