# Env Configuration

In [1]:
from google.colab import drive
drive.mount('drive', force_remount=True)
!pip install keras_metrics

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at drive
Collecting keras_metrics
  Downloading https://files.pythonhosted.org/packages/32/c9/a87420da8e73de944e63a8e9cdcfb1f03ca31a7c4cdcdbd45d2cdf13275a/keras_metrics-1.1.0-py2.py3-none-any.whl
Installing collected packages: keras-metrics
Successfully installed keras-metrics-1.1.0


## Setup

In [2]:
import os
import time
import gc
import numpy as np
import pandas as pd
from scipy import interp
from hyperopt import Trials, STATUS_OK, tpe
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc
from keras import backend as K
from keras import optimizers
from keras.layers import Conv3D, MaxPool3D, Flatten, Dense, Dropout, Input, concatenate
from keras.losses import binary_crossentropy
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.utils.vis_utils import plot_model, model_to_dot
import keras_metrics as km
import math
import itertools
import re
import os
import imageio
from scipy.ndimage import rotate
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import shutil
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Data settings
# ---------------------------------------------------------------------------
rotations_of_benignant = 10
rotations_of_malignant = 10
slices_per_nodule      = 5
base_dir               = "/content/drive/My Drive/masters/Research/data/"
images_dir             = f"{base_dir}images/solid-nodules/"
features_path          = f"{base_dir}features/solidNodules.csv"
# Header names of CSV columns 2..72, i.e. the same column slice that
# read_images() scales and returns as the radiomic feature vector.
features_names         = pd.read_csv(features_path).columns[2:73]
destination_folder     = f"{base_dir}convolutional_features/deep_features_with_radiomics/"
images_resolution      = 64

# ---------------------------------------------------------------------------
# Train/Validation settings
# ---------------------------------------------------------------------------
LEARNING_RATE = 0.0001
FOLDS         = 10

# ---------------------------------------------------------------------------
# Model params
# ---------------------------------------------------------------------------
convolucional_layer_units = 96
dense_layer_units_1       = 64
dense_layer_units_2       = 24
dropout_layer_1           = 0.241
dropout_layer_2           = 0.236
# (height, width, slices, channels) -> (64, 64, 5, 1)
input_shape               = (images_resolution, images_resolution, slices_per_nodule, 1)

# ---------------------------------------------------------------------------
# Radiomic feature sets: positions inside the 71-column feature vector.
# Note that position 22 is part of all_features_set only; none of the four
# category lists below contains it.
# ---------------------------------------------------------------------------
intensity_features      = list(range(0, 14))    # 0..13
shape_features          = list(range(14, 22))   # 14..21
texture_features        = list(range(23, 59))   # 23..58
edge_sharpness_features = list(range(59, 71))   # 59..70

optimized_features = [
    0, 1, 2, 4, 7, 9, 10, 16, 17, 20,
    24, 26, 27, 28, 29, 30, 31, 33, 36, 38,
    39, 42, 44, 45, 46, 48, 51, 52, 54, 55,
    57, 58, 60, 61, 63, 65, 69, 70,
]

all_features_set = list(range(71))              # 0..70

# ---------------------------------------------------------------------------
# Iterables used by the cross-validation cell
# ---------------------------------------------------------------------------
result_dfs = {}
layers     = ['dense1', 'dense2']
# Insertion order is the order in which the output files are produced.
feat_sets  = {
    'shape_features': shape_features,
    'intensity_features': intensity_features,
    'texture_features': texture_features,
    'edge_sharpness_features': edge_sharpness_features,
    'optimized_features': optimized_features,
    'all_features_set': all_features_set,
    'none': [],
}

## Normalize by taking the first slices
Function that normalizes the number of slices per nodule by keeping only the first slices (and padding nodules that have too few).

In [3]:
def normalize_first(nodules, n_slices):
    '''Give every nodule exactly ``n_slices`` slices.

    Parameters:
        nodules (sequence): nodules, each one a sequence of 2-D slices
        n_slices (int): number of slices every nodule must end up with
    Returns:
        list: one list of slices per nodule, in the input order
            - nodules with at most ``n_slices`` slices keep all of them and are
              completed with black (all-zero) slices of
              ``images_resolution x images_resolution``
            - longer nodules keep only their first ``n_slices`` slices'''

    fixed_length = []

    for nodule in nodules:
        missing = n_slices - len(nodule)

        if missing >= 0:
            # Short (or exact) nodule: keep everything, then pad with zeros.
            padded = list(nodule)
            padded.extend(np.zeros((images_resolution, images_resolution))
                          for _ in range(missing))
        else:
            # Long nodule: truncate to the leading slices.
            padded = [nodule[position] for position in range(n_slices)]

        fixed_length.append(padded)

    return fixed_length

## Data augmentation
Function that augments the data by rotating the slices of the nodules.


In [4]:
def rotate_slices(nodules, f, times, mode='constant'):
    '''Augment a batch of nodules with evenly spaced in-plane rotations.

    The full turn is divided into ``times`` equal steps. The original batch is
    kept as the first block of the result and is followed by one rotated copy
    of the whole batch for every remaining step (``times - 1`` copies).

    Parameters:
        nodules (array-like): batch of nodules; the rotation is applied in the
            plane of axes 1 and 2
        f (array-like): features of the nodules, one entry per nodule
        times (int): number of orientations per nodule, original included
        mode (str): how ``scipy.ndimage.rotate`` fills points outside the input
    Returns:
        rotated: the original batch followed by its rotated copies
        rep_feat: ``f`` repeated once per orientation, aligned with ``rotated``
        (with ``times == 1`` both inputs are returned untouched)
    Raises:
        ValueError: if ``times`` is smaller than 1'''

    if times < 1:
        raise ValueError("times must be a positive integer, got %r" % (times,))

    if times == 1:
        # A single orientation means there is nothing to add.
        return nodules, f

    angle = 360/times

    # Collect every orientation first and concatenate once; concatenating inside
    # the loop would re-copy the ever-growing result on every iteration.
    orientations = [nodules]
    for step in range(1, times):
        orientations.append(rotate(nodules, step*angle, (1, 2), reshape=False, mode=mode))

    rotated  = np.concatenate(orientations)
    rep_feat = np.concatenate([f] * times)

    return rotated, rep_feat

## Read images
Function that reads the images from files and returns them as a list of NumPy arrays, together with the matching features.

In [5]:
def read_images(path, path_features):
    '''Reads the images files in our file structure and mounts an array.

    Expected layout: ``path/<exam>/<nodule>/<number>.png``. Exam and nodule
    folders are visited in case-insensitive alphabetical order and the slices
    of a nodule are ordered by the number in their file name.

    Parameters:
        path (string): path to the nodules folders
        path_features (string): path to the features .csv
    Returns:
        list: list of nodules with slices as Numpy Arrays
        features: list of features corresponding to the nodules on list
    Raises:
        ValueError: if the features file does not hold exactly one row for an
            exam/nodule pair found on disk'''

    def _immediate_subdirs(directory):
        # Only the first os.walk() entry is needed: it lists the direct children.
        # Walking the whole tree here would later re-interpret nodule folders as
        # exam folders. A missing directory simply has no children.
        _, subdirs, _ = next(os.walk(directory), (directory, [], []))
        return sorted(subdirs, key=str.lower)

    df = pd.read_csv(path_features)

    # NOTE(review): the scaler is fitted on every row of the file, i.e. before
    # any train/test split takes place.
    scaler = MinMaxScaler(copy=False)
    df[df.columns[2:73]] = scaler.fit_transform(df[df.columns[2:73]])

    allFeatures = df.values

    axis         = 0 # To get the Rows indices
    examColumn   = 0 # Column of the csv where the exam code is
    noduleColumn = 1 # Column of the csv where the nodule code is

    lista    = []
    features = []

    for dirname in _immediate_subdirs(path):
        for dirname1 in _immediate_subdirs(path + "/" + dirname):
            # index of the rows that have the exam id equal to the exam id of the current nodule
            indExam = np.where(allFeatures[:,examColumn] == dirname)[axis]

            # index of the rows that have the nodule id equal to the id of the current nodule
            indNodule = np.where(allFeatures[:,noduleColumn] == dirname1)[axis]

            rows = np.intersect1d(indExam, indNodule)

            # Zero rows used to end in an opaque IndexError; several rows used to
            # print a warning and then store a malformed (2-D) feature entry.
            # Both cases make images and features inconsistent, so stop here.
            if len(rows) != 1:
                raise ValueError(
                    "Features error! Expected exactly one row in %r for exam %r / "
                    "nodule %r, found %d" % (path_features, dirname, dirname1, len(rows)))

            row = rows[0]

            # Every directory yielded by the walk contributes one list of slices.
            for root2, _, files2 in os.walk(path + "/" + dirname + "/" + dirname1):
                # Files are addressed by the first number in their name.
                numbers = [re.findall(r'\d+', x)[0] for x in files2]

                slices = []
                for f in sorted(numbers, key=float):
                    img = imageio.imread(root2 + "/" + f + ".png", as_gray=True)
                    slices.append(img)

                lista.append(slices)
                features.append(allFeatures[row,2:73].tolist())

    return lista, features

## My Kfold
Custom k-fold split that keeps the data balanced between the benign ("benigno") and malignant ("maligno") classes.

In [6]:
def my_kfold(ben, mal, f_ben, f_mal, n_splits, ben_rot, mal_rot):
    '''Build class-balanced cross-validation folds.

    The malignant samples are split with a plain ``KFold``. For every fold the
    benign samples are then randomly sub-sampled so that the benign test set
    has as many items as the malignant test set and the benign train set has
    as many items as the malignant train set. Train sets are augmented with
    ``rotate_slices``; test sets are left untouched. Benign samples are
    labelled 0 and malignant samples 1.

    Parameters:
        ben, mal (sequence): benign / malignant nodules
        f_ben, f_mal (sequence): features aligned with ``ben`` / ``mal``
        n_splits (int): number of folds
        ben_rot, mal_rot (int): ``times`` argument passed to ``rotate_slices``
            for the benign / malignant train samples
    Returns:
        X_train, X_test, f_train, f_test, Y_train, Y_test: lists with one
        entry per fold

    Note: the sub-sampling draws without replacement, so a benign fold must
    hold at least as many samples as the matching malignant fold.'''
    splitter = KFold(n_splits)

    def pick(items, indices):
        return [items[position] for position in indices]

    # Malignant class: regular k-fold partition.
    mal_train, mal_test = [], []
    f_mal_train, f_mal_test = [], []
    for train_index, test_index in splitter.split(mal):
        mal_train.append(pick(mal, train_index))
        f_mal_train.append(pick(f_mal, train_index))
        mal_test.append(pick(mal, test_index))
        f_mal_test.append(pick(f_mal, test_index))

    # Benign class: walk the malignant folds alongside so that the test folds
    # end up with the same number of items.
    ben_train, ben_test = [], []
    f_ben_train, f_ben_test = [], []
    for (train_index, test_index), mal_te, mal_tr in zip(splitter.split(ben), mal_test, mal_train):
        test_pick = np.random.choice(test_index, len(mal_te), replace=False)
        unused_test = np.setdiff1d(test_index, test_pick)

        # Benign test items that were not drawn go back into the train pool ...
        train_pool = np.concatenate((train_index, unused_test))
        # ... which is then cut down to the size of the malignant train set.
        train_pick = np.random.choice(train_pool, len(mal_tr), replace=False)

        ben_train.append(pick(ben, train_pick))
        f_ben_train.append(pick(f_ben, train_pick))
        ben_test.append(pick(ben, test_pick))
        f_ben_test.append(pick(f_ben, test_pick))

    # Test folds: benign first, malignant second, no augmentation.
    X_test = [np.concatenate((b, m), 0) for b, m in zip(ben_test, mal_test)]
    Y_test = [np.array(len(b) * [0] + len(m) * [1]) for b, m in zip(ben_test, mal_test)]
    f_test = [np.concatenate((f_b, f_m), 0) for f_b, f_m in zip(f_ben_test, f_mal_test)]

    # Train folds: same layout, but every class is augmented by rotation.
    X_train, f_train, Y_train = [], [], []
    for i in tqdm(range(n_splits)):
        print("INDEX: ", i)
        print("ben_train: ", len(ben_train[i]))
        print("mal_train: ", len(mal_train[i]))
        print("ben_test: ",  len(ben_test[i]))
        print("mal_test: ",  len(mal_test[i]))

        b, f_b_train = rotate_slices(nodules=ben_train[i], f=f_ben_train[i], times=ben_rot)
        m, f_m_train = rotate_slices(nodules=mal_train[i], f=f_mal_train[i], times=mal_rot)

        X_train.append(np.concatenate((b, m), 0))
        f_train.append(np.concatenate((f_b_train, f_m_train), 0))
        # Labels are sized after augmentation.
        Y_train.append(np.array(len(b) * [0] + len(m) * [1]))

    return X_train, X_test, f_train, f_test, Y_train, Y_test

## Get folds
Function that is called to get the folds of the cross validation

In [7]:
def get_folds(basedir, n_slices, features=None):
    '''Load both classes from disk and return the cross-validation folds.

    Images are read from ``basedir + "benigno"`` and ``basedir + "maligno"``,
    normalized to ``n_slices`` slices, arranged as
    ``(nodule, row, column, slice, channel)`` arrays, shuffled together with
    their features and handed to ``my_kfold`` (``FOLDS`` splits, rotation
    counts taken from the notebook settings).

    Parameters:
        basedir (string): folder holding the "benigno" and "maligno" folders
        n_slices (int): number of slices kept per nodule
        features (string): path to the features .csv, forwarded to read_images
    Returns:
        X_train, X_test, f_train, f_test, Y_train, Y_test as built by my_kfold'''

    def load_class(folder_name):
        nodules, nodule_features = read_images(basedir + folder_name, features)
        nodules = normalize_first(nodules, n_slices)

        volume = np.array(nodules).reshape(
            len(nodules), n_slices, images_resolution, images_resolution, 1)
        # Move the slice axis behind the two image axes.
        return np.moveaxis(volume, 1, 3), nodule_features

    def shuffle_together(nodules, nodule_features):
        # Shuffle the pairs so every nodule stays attached to its features.
        pairs = list(zip(nodules, nodule_features))
        np.random.shuffle(pairs)
        return zip(*pairs)

    ben, f_ben = load_class("benigno")
    mal, f_mal = load_class("maligno")

    ben, f_ben = shuffle_together(ben, f_ben)
    mal, f_mal = shuffle_together(mal, f_mal)

    return my_kfold(ben, mal, f_ben, f_mal, FOLDS,
                    rotations_of_benignant, rotations_of_malignant)

# Validation code


In [8]:
def get_model():
    '''Build and compile the 3D CNN used in the cross-validation.

    Architecture: Conv3D (3x3x3, relu) -> MaxPool3D (2x2x2) -> Flatten
    -> Dense 'dense1' + Dropout -> Dense 'dense2' + Dropout -> sigmoid unit.
    Layer sizes, dropout rates, input shape and learning rate come from the
    notebook settings. The names 'flatten', 'dense1' and 'dense2' are set
    explicitly so the layers can be fetched with ``model.get_layer``.

    Returns:
        Model: compiled with binary cross-entropy, RMSprop and accuracy,
        TP / TN / FP / FN and F1 metrics'''
    volume_input = Input(input_shape)

    # Convolutional feature extractor
    x = Conv3D(filters=convolucional_layer_units, kernel_size=(3, 3, 3),
               activation='relu')(volume_input)
    x = MaxPool3D(pool_size=(2, 2, 2))(x)
    x = Flatten(name='flatten')(x)

    # Fully connected head, dropout after each dense layer
    x = Dense(units=dense_layer_units_1, activation='relu', name='dense1')(x)
    x = Dropout(dropout_layer_1)(x)
    x = Dense(units=dense_layer_units_2, activation='relu', name='dense2')(x)
    x = Dropout(dropout_layer_2)(x)

    prediction = Dense(units=1, activation='sigmoid')(x)

    model = Model(inputs=volume_input, outputs=prediction)

    tracked_metrics = [
        'accuracy',
        km.binary_true_positive(),
        km.binary_true_negative(),
        km.binary_false_positive(),
        km.binary_false_negative(),
        km.binary_f1_score(),
    ]
    model.compile(loss=binary_crossentropy,
                  optimizer=optimizers.RMSprop(lr=LEARNING_RATE),
                  metrics=tracked_metrics)

    return model

## Cross-validation

In [None]:
# Cross-validation driver: for every fold a fresh model is trained, the
# activations of the layers listed in `layers` are extracted for the test
# samples, optionally joined with a set of radiomic features, and the rows of
# all folds are written to one CSV per (layer, feature set) combination.
start = time.time()
X_train_, X_test_, f_train_, f_test_, Y_train_, Y_test_ = get_folds(basedir=images_dir, n_slices=slices_per_nodule, features=features_path)

# Create the output folder up front so a missing directory cannot make the
# final to_csv() fail after all the folds have been trained.
os.makedirs(destination_folder, exist_ok=True)

# file name -> per-fold frames. Built from scratch on every run of this cell,
# so re-running it does not append duplicated rows to earlier results.
fold_frames = {}

folds = zip(X_train_, X_test_, f_train_, f_test_, Y_train_, Y_test_)
for i, (X_train, X_test, f_train, f_test, Y_train, Y_test) in enumerate(folds, start=1):
  print("ITERATION: " + str(i))

  # Drop the previous fold's model/graph before building a new one; otherwise
  # the backend state keeps growing fold after fold. Done at the start of the
  # iteration so that the model of the last fold stays usable afterwards.
  K.clear_session()
  gc.collect()

  model = get_model()
  model.fit(X_train, Y_train, batch_size=128, epochs=10, verbose=0)

  for layer in layers:
    # Truncated copy of the network that outputs the activations of `layer`.
    intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer(layer).output)
    output                   = intermediate_layer_model.predict(X_test)

    for feature_set in feat_sets:
      deep_features_df = pd.DataFrame(output)
      file_name        = destination_folder + (layer) + "_" + feature_set + ".csv"

      print(file_name)
      features_selected_set = feat_sets[feature_set]

      # 'none' stands for deep features only.
      if (feature_set != 'none'):
        features_selected_set_df = pd.DataFrame(f_test[:,features_selected_set], columns=features_names[features_selected_set])
        deep_features_df         = pd.concat([deep_features_df, features_selected_set_df], axis=1, sort=False)

      deep_features_df['class'] = Y_test

      fold_frames.setdefault(file_name, []).append(deep_features_df)

# One concatenation per output file instead of one per fold.
for file_name, frames in fold_frames.items():
  result_dfs[file_name] = pd.concat(frames)
  result_dfs[file_name].to_csv(file_name, index=False)