<a href="https://colab.research.google.com/github/MiHess/cxr-bse/blob/master/data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import glob
import numpy as np
import PIL
import logging
import random
import matplotlib.pyplot as plt
plt.style.use('default')

from collections import defaultdict
from tqdm import tqdm

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img


In [0]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [9]:
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)


Mounted at /content/drive/


In [0]:
bse_data_path = "/content/drive/My Drive/dev/bse/data"

jsrt_bse_path = os.path.join(bse_data_path, "jsrt_bse")

jsrt_path = os.path.join(bse_data_path, "jsrt")


In [0]:
def _load_grayscale_image(filepath):
    """
    """
    img_array = np.array(load_img(filepath))[:,:,1]
    
    return img_array.reshape((1,) + img_array.shape)  
    

def get_train_test_data(X_images_path, y_images_path, test_fraction=0.2):
    """ Loads all images from disk to memory and returns them as numpy array.
    """
    EXPECTED_TOTAL_NUMBER = 247
    
    X_image_filepaths = sorted(glob.glob(os.path.join(X_images_path, "*.png")))
    y_image_filepaths = sorted(glob.glob(os.path.join(y_images_path, "*.png")))
    
    if (len(X_image_filepaths) != EXPECTED_TOTAL_NUMBER) or (len(y_image_filepaths) != EXPECTED_TOTAL_NUMBER):
        raise ValueError(f"expected {EXPECTED_TOTAL_NUMBER} images. "
            f"Found {len(X_image_filepaths)} X and {len(y_image_filepaths)} y images, respectively.")
    else:
        logger.info(f"Found {len(X_image_filepaths)} X images and {len(y_image_filepaths)} y images.")    

    X_images = []
    y_images = []
    for X_image_filepath, y_image_filepath in zip(X_image_filepaths, y_image_filepaths):
        if os.path.basename(X_image_filepath) == os.path.basename(y_image_filepath):
            X_images.append(_load_grayscale_image(X_image_filepath))
            y_images.append(_load_grayscale_image(y_image_filepath))
        else:
            raise ValueError(f"image names do not match: {X_image_filepath} and {y_image_filepath}")
        
    test_idc = random.sample(range(EXPECTED_TOTAL_NUMBER), int(EXPECTED_TOTAL_NUMBER * test_fraction))
    train_idc = [idx for idx in range(EXPECTED_TOTAL_NUMBER) if idx not in test_idc]
    
    return np.array(X_images)[train_idc], np.array(y_images)[train_idc], \
        np.array(X_images)[test_idc], np.array(y_images)[test_idc]
        


In [128]:
X_train, y_train, X_test, y_test = get_train_test_data(jsrt_path, jsrt_bse_path)

INFO:root:Found 247 X images and 247 y images.


In [132]:
X_train.shape
y_train.shape
X_test.shape
y_test.shape

(49, 1, 512, 512)

In [15]:
batch_size = 16

# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

# this is the augmentation configuration we will use for testing:
# only rescaling
test_datagen = ImageDataGenerator(rescale=1./255)

# this is a generator that will read pictures found in
# subfolers of 'data/train', and indefinitely generate
# batches of augmented image data
train_generator = train_datagen.flow_from_directory(
        jsrt_bse_path,  # this is the target directory
        target_size=(150, 150),  # all images will be resized to 150x150
        batch_size=batch_size,
        color_mode="grayscale",
        class_mode="input")  # since we use binary_crossentropy loss, we need binary labels

Found 0 images belonging to 0 classes.


In [0]:
train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        'data/train',
        target_size=(150, 150),
        batch_size=32,
        class_mode='binary')

validation_generator = test_datagen.flow_from_directory(
        'data/validation',
        target_size=(150, 150),
        batch_size=32,
        class_mode='binary')

In [0]:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

datagen = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True)

# compute quantities required for featurewise normalization
# (std, mean, and principal components if ZCA whitening is applied)
datagen.fit(x_train)

# fits the model on batches with real-time data augmentation:
model.fit_generator(datagen.flow(x_train, y_train, batch_size=32),
                    steps_per_epoch=len(x_train) / 32, epochs=epochs)

# here's a more "manual" example
for e in range(epochs):
    print('Epoch', e)
    batches = 0
    for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
        model.fit(x_batch, y_batch)
        batches += 1
        if batches >= len(x_train) / 32:
            # we need to break the loop by hand because
            # the generator loops indefinitely
            break

In [0]:
raw_jsrt_filepaths = glob.glob(os.path.join(raw_jsrt_path, "*.IMG"))

for raw_jsrt_bse_filepath in tqdm(raw_jsrt_bse_filepaths):
    process_raw_image(raw_jsrt_bse_filepath, load_raw_jsrt_bse_image, target_jsrt_bse_path)
    

100%|██████████| 247/247 [03:21<00:00,  1.65it/s]
