In [1]:
# Primero cargamos las imágenes procesadas:

import numpy as np
import h5py

# Read the preprocessed images into memory. The context manager releases the
# HDF5 handle as soon as the data has been copied (and also if the read fails),
# instead of leaving the file open for the lifetime of the kernel.
with h5py.File("Preprocesado/processed.h5", "r") as hf:
    processed_images = np.array(hf.get("processed_images"))

In [24]:
# Antes de nada hemos de crear índices para el train y test para así aumentar por separado el training y el test.
import os

# Number of entries in the directory holding the original wound images.
cantidad_imagenes = len(os.listdir("Preprocesado/Heridas/Originales"))

# 30 % of the images are held out for testing, the rest is used for training.
size_test = int(0.3*cantidad_imagenes)
size_train = cantidad_imagenes-size_test

indexes = list(range(0,cantidad_imagenes))

# Draw the split from a dedicated, seeded generator so that a fresh
# "Restart & Run All" reproduces exactly the same train/test partition
# (the unseeded global generator produced a different split on every run).
split_seed = 0
split_rng = np.random.RandomState(split_seed)

# Sample the training indices without replacement; everything else is test.
index_train = list(split_rng.choice(indexes, size_train, replace = False))
index_test = list(np.setdiff1d(indexes,index_train))

In [25]:
import os
import PIL
from PIL import Image
from io import BytesIO
import numpy as np
import matplotlib.pyplot as plt
import re
from keras.preprocessing import image
# Directory listing of the original wound images; the cropping loop further down uses its length.
files = os.listdir("Preprocesado/Heridas/Originales")

# Lo primero que hemos de hacer es automatizar el proceso de recortar en cuadrados las imágenes de heridas y de los tejidos sanos de la misma imagen.

def cut(path_load, name):
    """Load an image and crop it to a square around its non-black content.

    The image is read from ``path_load + "/" + name``. Pixels whose channel sum
    is greater than zero are treated as content. The crop starts at the first
    row and first column that contain content, and its side equals the larger
    of the two extents (last index minus first index). If that square runs past
    the image border, the missing rows are padded with black on top, or the
    missing columns with black on the left, so the result is always square.

    Parameters
    ----------
    path_load : str
        Directory that contains the image.
    name : str
        File name of the image inside ``path_load``.

    Returns
    -------
    numpy.ndarray
        Square array of shape (side, side, 3).

    Raises
    ------
    ValueError
        If the image has no pixel with a channel sum greater than zero.
    """
    # Imported locally so the function does not depend on notebook-global
    # names, which other cells may rebind.
    import numpy as np
    from PIL import Image
    from keras.preprocessing import image

    # Close the file handle as soon as the pixels have been copied to an array.
    with Image.open(path_load+"/"+name) as pil_img:
        img = image.img_to_array(pil_img)

    # Summing the colour channels gives one value per pixel; anything above
    # zero counts as content.
    content = np.sum(img, axis = 2) > 0

    # Rows / columns that contain at least one content pixel. Index 0 is a
    # legitimate position and must be kept (filtering with "> 0" would drop
    # content that touches the top or left border of the image).
    rows = np.where(content.any(axis = 1))[0]
    cols = np.where(content.any(axis = 0))[0]

    if rows.size == 0:
        raise ValueError("image " + path_load + "/" + name + " has no non-black pixels to crop")

    limits_x = (rows[0], rows[-1])   # first / last row with content
    limits_y = (cols[0], cols[-1])   # first / last column with content

    # Side of the square crop: the larger of the two extents.
    # NOTE(review): the extent is computed as max - min, so the slice below
    # leaves out the last content row/column; kept as in the original.
    new_dims = max(limits_x[1]-limits_x[0], limits_y[1]-limits_y[0])

    # Slicing silently truncates at the image border, so the crop may come out
    # non-square when the square would extend beyond the image.
    img_new = img[limits_x[0]:limits_x[0]+new_dims, limits_y[0]:limits_y[0]+new_dims, :]

    height, width = img_new.shape[0], img_new.shape[1]

    if height < width:
        # Too few rows: add black rows on top.
        zeros = np.zeros((width-height, width, 3))
        return np.vstack((zeros, img_new))
    if width < height:
        # Too few columns: add black columns on the left.
        zeros = np.zeros((height, height-width, 3))
        return np.hstack((zeros, img_new))
    return img_new

# Crop every wound image and the healthy-tissue image with the same number.
# Files are expected to be named "1.jpg", "2.jpg", ... in both directories.
numeros_imagen = range(1, len(files)+1)

heridas_recortadas = []
sanos_recortados = []

for numero in numeros_imagen:
    nombre = str(numero)+".jpg"
    heridas_recortadas.append(cut(path_load = "Preprocesado/Heridas/Originales", name = nombre))
    sanos_recortados.append(cut(path_load = "Preprocesado/Sanos/Originales", name = nombre))

# Split both lists with the train / test indices drawn earlier, keeping each
# wound image paired with its healthy counterpart.
heridas_recortadas_train = [heridas_recortadas[idx] for idx in index_train]
sanos_recortados_train = [sanos_recortados[idx] for idx in index_train]

heridas_recortadas_test = [heridas_recortadas[idx] for idx in index_test]
sanos_recortados_test = [sanos_recortados[idx] for idx in index_test]

In [26]:
# Ahora haremos algo de DataAugmentation para conseguir algunas imágenes extra.

from numpy import expand_dims
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# As a first approach we generate several randomly modified copies of each image (the exact
# number is set by num_new_images below) using translations, rotations, zooms, flips, ...
# A dedicated preprocessing function is defined for this case.

def preprocessing_dataug(herida, sano):
    """Standardize a wound image against its healthy-tissue image and rescale to [0, 1].

    Each of the first three channels of ``herida`` is standardized with the
    mean and standard deviation of the same channel of ``sano``. The three
    standardized channels are stacked and min-max normalized together so that
    the result contains no negative values.

    Parameters
    ----------
    herida : numpy.ndarray
        Wound image, indexed as [row, column, channel] with at least 3 channels.
    sano : numpy.ndarray
        Healthy-tissue image, indexed the same way. Only its per-channel mean
        and standard deviation are used, so its height/width need not match.

    Returns
    -------
    numpy.ndarray
        Array with the height and width of ``herida`` and 3 channels, with
        values in [0, 1]. If the standardized image is constant, an all-zero
        array is returned instead of NaN.
    """
    import numpy as np

    def standardize(reference, target):
        # Centre and scale `target` with the statistics of `reference`.
        mean = np.mean(reference)
        std = np.std(reference)
        # A constant reference channel has zero spread; only centre in that
        # case instead of dividing by zero (which would yield inf / NaN).
        scale = std if std > 0 else np.ones_like(std)
        return (target-mean)/scale

    channels = [standardize(sano[:,:,c], herida[:,:,c]) for c in range(3)]

    # Unity-based (min-max) normalization over the whole image removes the
    # negative values introduced by standardization.
    image_std = np.stack(channels, axis=-1)
    lowest = np.min(image_std)
    span = np.max(image_std)-lowest

    if span == 0:
        # Flat image: there is no contrast to rescale, avoid 0/0.
        return np.zeros_like(image_std)

    return (image_std-lowest)/span

In [27]:
# Train and test are augmented separately so that no augmented copy of a
# training image ends up in the test set (and vice versa).

new_heridas_recortadas_train = []
new_heridas_recortadas_test = []
num_new_images = 6

# One generator is enough for every image: it only stores the augmentation
# settings, the random transforms are drawn each time a batch is requested.
datagen = ImageDataGenerator(width_shift_range = 0.2, height_shift_range = 0.2, horizontal_flip = True, rotation_range = 90, zoom_range = [0.25,0.75])

def augment_heridas(heridas, sanos, n_new, generator):
    """Create ``n_new`` augmented, preprocessed copies of every wound image.

    Parameters
    ----------
    heridas : list of numpy.ndarray
        Cropped wound images.
    sanos : list of numpy.ndarray
        Cropped healthy-tissue images, paired by position with ``heridas``.
    n_new : int
        Number of augmented copies to generate per wound image.
    generator : ImageDataGenerator
        Generator that applies the random transformations.

    Returns
    -------
    list of numpy.ndarray
        For each wound image in order, its ``n_new`` augmented copies after
        ``preprocessing_dataug`` against the paired healthy image.
    """
    augmented = []
    for herida, sano in zip(heridas, sanos):
        # flow() expects a batch dimension in front.
        samples = expand_dims(herida, 0)
        it = generator.flow(samples, batch_size = 1)

        for _ in range(n_new):
            # next(it) works with every Keras iterator version.
            batch = next(it)
            # Named so that it does not shadow the keras `image` module.
            augmented_img = batch[0].astype('uint8')
            augmented.append(preprocessing_dataug(augmented_img, sano))
    return augmented

new_heridas_recortadas_train.extend(augment_heridas(heridas_recortadas_train, sanos_recortados_train, num_new_images, datagen))
new_heridas_recortadas_test.extend(augment_heridas(heridas_recortadas_test, sanos_recortados_test, num_new_images, datagen))

In [33]:
# Ahora hemos de preprocesar también las imágenes originales y luego concatenar ambas listas (la original y la creada)

# Standardize every original wound crop against its healthy counterpart.
# The lists are updated in place, element by element.
for idx, (herida, sano) in enumerate(zip(heridas_recortadas_train, sanos_recortados_train)):
    heridas_recortadas_train[idx] = preprocessing_dataug(herida, sano)

for idx, (herida, sano) in enumerate(zip(heridas_recortadas_test, sanos_recortados_test)):
    heridas_recortadas_test[idx] = preprocessing_dataug(herida, sano)

# Final sets: preprocessed originals first, followed by the augmented copies.
data_augmented_train = [*heridas_recortadas_train, *new_heridas_recortadas_train]
data_augmented_test = [*heridas_recortadas_test, *new_heridas_recortadas_test]

In [39]:
# Ahora haremos un resize y las dispondremos en un array.

def resize(img, X, Y):
    """Resize ``img`` to ``X`` by ``Y`` with bilinear interpolation.

    The aspect ratio is not preserved and no antialiasing is applied. The
    TensorFlow result is converted back with ``.numpy()`` before returning.
    """
    import tensorflow as tf

    target_size = [X, Y]
    resized = tf.image.resize(
        img,
        target_size,
        method = tf.image.ResizeMethod.BILINEAR,
        preserve_aspect_ratio = False,
        antialias = False,
        name = None,
    )
    return resized.numpy()

# Size along the first axis of every image in each set.
sizes_train = [img.shape[0] for img in data_augmented_train]
new_dim_train = int(round(np.mean(sizes_train)))

sizes_test = [img.shape[0] for img in data_augmented_test]
new_dim_test = int(round(np.mean(sizes_test)))

# Common target size: rounded mean size over train and test together.
new_dim = int(round(np.mean(sizes_train+sizes_test)))

# Bring every image to new_dim x new_dim and stack each set into one array
# with the image index as first axis.
new_data_augmented_train = np.stack(
    [resize(img, new_dim, new_dim) for img in data_augmented_train],
    axis = 0,
)
new_data_augmented_test = np.stack(
    [resize(img, new_dim, new_dim) for img in data_augmented_test],
    axis = 0,
)

In [80]:
# Ahora hemos de sacar las etiquetas del CSV.

import pandas as pd

data = pd.read_csv("Leucoplasia307.csv")

# Keep only the rows whose "MiLabel" value is positive.
milabel = np.array(data["MiLabel"].values)
indices = np.where(milabel > 0)[0]

labels = np.array(data["Evolucioncancer"].values)
labels = list(labels[indices])

# Labels of the original images, in the order of the train / test indices.
labels_train = [labels[idx] for idx in index_train]
labels_test = [labels[idx] for idx in index_test]
length_train = len(labels_train)
length_test = len(labels_test)

# The augmented images need labels too: after the originals, each original
# label is repeated num_new_images times, following the order in which the
# augmented images were generated.
labels_train = labels_train + [
    etiqueta
    for etiqueta in labels_train[:length_train]
    for _ in range(num_new_images)
]
labels_test = labels_test + [
    etiqueta
    for etiqueta in labels_test[:length_test]
    for _ in range(num_new_images)
]

labels_train = np.array(labels_train)
labels_test = np.array(labels_test)

In [114]:
# Del mismo CSV también queremos extraer la histologia.
import pandas as pd

# Histology value of every selected CSV row (same row selection as the labels).
histologia = np.array(data["Histologia"].values)
histologia = list(histologia[indices])

histologia_originales_train = [histologia[i] for i in index_train]
histologia_originales_test = [histologia[i] for i in index_test]

# Originals first, then each original value repeated once per augmented copy,
# matching the order of the augmented image sets.
histologia_train = histologia_originales_train + [
    valor for valor in histologia_originales_train for _ in range(num_new_images)
]
histologia_test = histologia_originales_test + [
    valor for valor in histologia_originales_test for _ in range(num_new_images)
]

# One-hot encode. The test set is encoded against the columns found in the
# training set, so both arrays have the same columns in the same order even if
# a category is missing from (or only present in) the test set. This replaces
# the previous placeholder that copied the first 21 training rows into the
# test array.
histologia_train_dummies = pd.get_dummies(histologia_train)
histologia_test_dummies = pd.get_dummies(histologia_test).reindex(
    columns = histologia_train_dummies.columns, fill_value = 0
)

histologia_train = np.array(histologia_train_dummies)
# Cast to the training dtype: columns added by reindex may have a different
# dtype than the ones produced by get_dummies.
histologia_test = np.array(histologia_test_dummies).astype(histologia_train.dtype)

(21,)

In [76]:
# Lo almacenaremos todo en un h5 para transportarlo.

# Write everything needed downstream into a single HDF5 file. The context
# manager closes the file even if one of the writes fails.
with h5py.File("datos_aumentados2.h5", "w") as hf:
    hf.create_dataset("data_augmented_train", data = new_data_augmented_train)
    hf.create_dataset("data_augmented_test", data = new_data_augmented_test)
    # Labels of the original images only (kept for existing readers).
    hf.create_dataset("labels", data = labels)
    # Labels aligned with data_augmented_train / data_augmented_test; without
    # them the augmented sets cannot be matched to their targets.
    hf.create_dataset("labels_train", data = labels_train)
    hf.create_dataset("labels_test", data = labels_test)
    hf.create_dataset("histologia_train", data = histologia_train)
    hf.create_dataset("histologia_test", data = histologia_test)