# Configure environment

In [0]:
# Mount Google Drive at the relative path 'drive' so the dataset folders are
# reachable from this runtime.
# NOTE(review): the paths used later start with /content/drive, so this
# presumably relies on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('drive', force_remount=True)

# Imports

In [0]:
import math
import itertools
import re
import os
import imageio
import numpy as np
from scipy.ndimage import rotate
from sklearn.model_selection import KFold
from tqdm import tqdm
import shutil

# Settings

In [0]:
# Fixed random seed so that runs of this study can be reproduced
np.random.seed(1937)

# Side length, in pixels, of the (square) slice images
RES = 64

# Number of nodules of each class held out for the test split
TEST_SIZE = 50


# ------------------------------data type settings------------------------------

# Number of slices kept for each nodule
SLICES = 5

# Slice selection strategy: 'first' keeps the first slices, any other value
# selects the balanced (evenly spread) strategy
STRATEGY = 'balanced'

# Determines whether slices may be repeated to fill short nodules
REPEAT = False

prefix = "/content/drive/My Drive/Pesquisa - Dicom images/data"

# Folder where the numpy files will be saved; its name encodes the settings
np_folder = f"{prefix}/nps/solid-nodules/data-{SLICES}-{STRATEGY}"
if REPEAT:
    np_folder = f"{np_folder}-repeat"

# ------------------------------------------------------------------------------

# Root folder of the source images, one sub-folder per class
base_dir = f"{prefix}/images/solid-nodules-with-attributes/"

# Folder with the benign ("benigno") image files
ben_dir = f"{base_dir}benigno"

# Folder with the malignant ("maligno") image files
mal_dir = f"{base_dir}maligno"

# Balanced Strategy

In [0]:
def normalize_balanced(nodules, n_slices, repeat=False):
    '''Normalizes every nodule to ``n_slices`` slices, sampled evenly.

    - A nodule with at most ``n_slices`` slices keeps all of them and is
      completed with black (all-zero) ``RES`` x ``RES`` slices.
    - A nodule with more than ``n_slices`` slices keeps its first and last
      slices plus evenly spaced ones in between: the k-th kept slice is the
      one at 0-based index ``round(k * (len(nodule) - 1) / (n_slices - 1))``
      for ``k = 0 .. n_slices - 1``.

    Parameters:
        nodules (list): list of nodules, each one a sequence of slices
        n_slices (int): number of slices each returned nodule must have
        repeat (bool): when True, every slice is first repeated
            ``ceil(n_slices / len(nodule))`` times in a row, so short nodules
            are filled with real slices instead of black ones
    Returns:
        list: one list of slices per input nodule
    '''
    normalized_slices = []

    for nodule in nodules:
        # An empty nodule has nothing to repeat (and would divide by zero);
        # it falls through to the padding branch, exactly as it does when
        # repeat is False.
        if repeat and len(nodule) > 0:
            times = math.ceil(n_slices / len(nodule))
            nodule = [img for img in nodule for _ in range(times)]

        total = len(nodule)
        if total <= n_slices:
            # Keep everything and complete with black slices
            new_nodule = list(nodule)
            missing = n_slices - total
            new_nodule.extend(np.zeros((RES, RES)) for _ in range(missing))
        elif n_slices == 1:
            # A single slice cannot be both "first" and "last"; keep the first
            new_nodule = [nodule[0]]
        else:
            # Evenly spaced 0-based indices from 0 to total - 1 inclusive.
            # The indices are strictly increasing because total > n_slices,
            # so no slice is selected twice.
            last = total - 1
            new_nodule = [
                nodule[round(k * last / (n_slices - 1))]
                for k in range(n_slices)
            ]
        normalized_slices.append(new_nodule)
    return normalized_slices

# Get first slices Strategy

In [0]:
def normalize_first(nodules, n_slices, repeat=False):
    '''Normalizes every nodule to ``n_slices`` slices, keeping the first ones.

    - A nodule with at most ``n_slices`` slices keeps all of them and is
      completed with black (all-zero) ``RES`` x ``RES`` slices.
    - A nodule with more than ``n_slices`` slices keeps only its first
      ``n_slices`` slices.

    Parameters:
        nodules (list): list of nodules, each one a sequence of slices
        n_slices (int): number of slices each returned nodule must have
        repeat (bool): when True, every slice is first repeated
            ``ceil(n_slices / len(nodule))`` times in a row, so short nodules
            are filled with real slices instead of black ones
    Returns:
        list: one list of slices per input nodule
    '''
    normalized_slices = []

    for nodule in nodules:
        # An empty nodule has nothing to repeat (and would divide by zero);
        # it is padded with black slices, exactly as when repeat is False.
        if repeat and len(nodule) > 0:
            times = math.ceil(n_slices / len(nodule))
            nodule = [img for img in nodule for _ in range(times)]

        # max() guards against a negative n_slices turning into a
        # "drop the last items" slice.
        new_nodule = list(nodule[:max(n_slices, 0)])

        # Complete short nodules with black slices
        missing = n_slices - len(new_nodule)
        new_nodule.extend(np.zeros((RES, RES)) for _ in range(missing))

        normalized_slices.append(new_nodule)
    return normalized_slices

# Read images and features, connecting each nodule to its features

In [0]:
def read_images(path, category):
    '''Reads the image files in our file structure and mounts an array.

    The slices are looked up two folder levels below ``path``
    (``path/<dir>/<subdir>``). Folders are visited in case-insensitive
    alphabetical order. Inside each visited folder the first run of digits of
    every file name is taken as the slice number, the numbers are sorted
    numerically and each slice is loaded, in grayscale, from
    ``<number>.png``. File names without any digit are ignored.

    Because ``os.walk`` is recursive, every directory found under
    ``path/<dir>/<subdir>`` (the folder itself included) contributes one list
    of slices, possibly empty.

    Parameters:
        path (string): path to the nodules folders
        category (string): benigno or maligno (currently unused)
    Returns:
        list: list of nodules with slices as Numpy Arrays
    '''
    lista = []
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    number = re.compile(r'\d+')

    for _root, dirs, _files in os.walk(path):
        for dirname in sorted(dirs, key=str.lower):
            for _root1, dirs1, _files1 in os.walk(path + "/" + dirname):
                for dirname1 in sorted(dirs1, key=str.lower):
                    for root2, _dirs2, files2 in os.walk(path + "/" + dirname + "/" + dirname1):
                        # Slice number of each file; names with no digits
                        # cannot be slices and are skipped instead of
                        # raising IndexError.
                        matches = (number.search(name) for name in files2)
                        numbers = [m.group() for m in matches if m]

                        # Numeric (not lexicographic) order, so 2 precedes 10
                        slices = [
                            imageio.imread(root2 + "/" + num + ".png", as_gray=True)
                            for num in sorted(numbers, key=float)
                        ]

                        lista.append(slices)
    return lista

# Data augmentation

In [0]:
def rotate_slices(slices, times, mode='constant'):
    '''Augments a batch of images with rotated copies of itself.

    The full turn is divided in ``times`` equal steps of ``360 / times``
    degrees. The batch is rotated, in the plane of axes 1 and 2 and without
    changing its shape, by each non-zero multiple of that step.

    Parameters:
        slices (array): batch of images; axes 1 and 2 are the rotation plane
        times (int): number of steps the full turn is divided into
        mode (string): border handling mode forwarded to
            ``scipy.ndimage.rotate``
    Returns:
        array: the original batch followed by its ``times - 1`` rotated
        copies, joined along the first axis (``slices`` itself when no
        rotated copy is produced)
    '''
    step = 360 / times

    batches = [slices]
    batches.extend(
        rotate(slices, k * step, (1, 2), reshape=False, mode=mode)
        for k in range(1, times)
    )

    # Nothing was rotated: hand back the input untouched
    if len(batches) == 1:
        return slices
    return np.concatenate(batches)

# Script to read and save data on numpy files

In [0]:
if __name__ == "__main__":
    # Builds the train/test numpy files: read the images, normalize the number
    # of slices per nodule, hold out a test split, augment the training data
    # by rotation and save everything under np_folder.

    print("Lendo imagens do disco")

    # One list of slices per nodule, for each class
    ben = read_images(ben_dir, "benigno")
    mal = read_images(mal_dir, "maligno")

    # Bring every nodule to exactly SLICES slices with the configured strategy
    if STRATEGY == 'first':
        ben = normalize_first(ben, SLICES, REPEAT)
        mal = normalize_first(mal, SLICES, REPEAT)
    else:
        ben = normalize_balanced(ben, SLICES, REPEAT)
        mal = normalize_balanced(mal, SLICES, REPEAT)

    print("Mudando a forma")

    print(">", len(ben))

    # Stack all slices and regroup them per nodule, with a trailing axis of
    # size 1: (nodules, SLICES, RES, RES, 1)
    ben = np.concatenate(ben).reshape(len(ben), SLICES, RES, RES, 1)
    mal = np.concatenate(mal).reshape(len(mal), SLICES, RES, RES, 1)

    print("Trocando os eixos")

    print("Antes: ", ben.shape)

    # (nodules, SLICES, RES, RES, 1) -> (nodules, RES, RES, SLICES, 1)
    ben = np.moveaxis(ben, 1, 3)
    mal = np.moveaxis(mal, 1, 3)

    print("Depois: ", ben.shape)

    print("Separando dados de teste")

    # Draw TEST_SIZE distinct nodules of each class for the test split
    ben_test_indices = np.random.choice(len(ben), TEST_SIZE, replace=False)
    mal_test_indices = np.random.choice(len(mal), TEST_SIZE, replace=False)

    ben_test = ben[ben_test_indices]
    mal_test = mal[mal_test_indices]

    # Everything that was not drawn stays in the training split
    ben_train = np.delete(ben, ben_test_indices, axis=0)
    mal_train = np.delete(mal, mal_test_indices, axis=0)

    # Release the full arrays before the augmentation allocates more memory.
    # The ben_dir / mal_dir settings are deliberately kept: deleting them made
    # a second run of this cell fail with NameError.
    del ben, mal, ben_test_indices, mal_test_indices

    print("Aumento de base")

    # Only the training data is augmented. NOTE(review): the different factors
    # (5 vs 13) presumably compensate for class imbalance -- confirm.
    ben_train = rotate_slices(ben_train, 5)
    mal_train = rotate_slices(mal_train, 13)

    print("Juntando benignos e malignos")

    X_train = np.concatenate([ben_train, mal_train])
    X_test = np.concatenate([ben_test, mal_test])

    print("Gerando labels")

    # Label 0 for the benign samples, 1 for the malignant ones, in the same
    # order used to build X_train / X_test
    train_labels = len(ben_train) * [0] + len(mal_train) * [1]
    test_labels = len(ben_test) * [0] + len(mal_test) * [1]

    print("Tipo categórico")

    Y_train = np.array(train_labels)
    Y_test = np.array(test_labels)

    data = np_folder

    # Start from a clean output folder; makedirs also creates missing parent
    # folders, which os.mkdir would refuse to do.
    shutil.rmtree(data, ignore_errors=True)
    os.makedirs(data, exist_ok=True)

    np.save(data + "/X_train.npy", X_train)
    np.save(data + "/X_test.npy", X_test)
    np.save(data + "/Y_train.npy", Y_train)
    np.save(data + "/Y_test.npy", Y_test)

Lendo imagens do disco
