# Image Subsetter
---

In [96]:
from scipy.misc import imread
from skimage.transform import resize
from tqdm import tqdm
import numpy as np
import glob
import uuid

In [127]:
def get_image_id(image_filepath):
    """Return the file name of ``image_filepath`` without directory or extension.

    Both Windows (``\\``) and POSIX (``/``) separators are accepted, and the
    extension is removed by splitting on the last ``.`` rather than by cutting
    a fixed number of characters, so ``.jpeg`` / ``.tif`` names work too.

    Parameters
    ----------
    image_filepath : str
        Path (or bare file name) of an image file.

    Returns
    -------
    str
        The file name stem. Names with no extension, or whose only ``.`` is
        the leading character, are returned unchanged.
    """
    # Normalise separators so the same code works for paths from either OS.
    image_name = image_filepath.replace('\\', '/').rsplit('/', 1)[-1]
    stem, dot, _extension = image_name.rpartition('.')
    # rpartition yields an empty `dot` when there is no '.', and an empty
    # `stem` for dot-files; in both cases there is no extension to strip.
    return stem if dot and stem else image_name

def get_mask_from_id(image_id, masked_image_filepaths):
    """Load the mask whose file id equals ``image_id``.

    The id of every path in ``masked_image_filepaths`` is computed with
    ``get_image_id``; the first path whose id matches is read with ``imread``
    and returned. ``list.index`` raises ``ValueError`` when no id matches.
    """
    mask_ids = list(map(get_image_id, masked_image_filepaths))
    match_position = mask_ids.index(image_id)
    matching_path = masked_image_filepaths[match_position]
    return imread(matching_path)

def resize_image_and_mask(image, mask, dst_size):
    """Resize ``image`` and ``mask`` to ``dst_size`` and return them as a pair.

    Both arrays go through ``resize`` with ``order=0`` so they are treated
    identically; the image is resized first, then the mask.
    """
    resized_image, resized_mask = (
        resize(array, dst_size, order=0) for array in (image, mask)
    )
    return resized_image, resized_mask

# Build a sliding-window patch dataset from the resized images and their masks.
# This cell only fills the in-memory arrays below (dataset, masks, labels, ids);
# it does not write a train/test/validation split and performs no augmentation.
# 1. Read in an image and its mask, and resize both to image_size.
# 2. Slide a kernel_size window over both with stride step_size.
# 3. For each window position, sum the mask values inside the window.
# 4. If that sum is strictly greater than crack_threshold, label the patch 1 (cracked), else 0.
# 5. Store the image patch, mask patch, label, and a freshly generated id.

# The glob patterns use Windows-style path separators.
image_filepaths = glob.glob('.\\data\\resized-images\\*.png')
mask_filepaths  = glob.glob('.\\data\\annotated-images-masks\\*.png')

image_size      = [608, 608]
kernel_size     = [128, 128]
step_size       = [30, 30]
crack_threshold = 40

# Window start offsets. The patch counts are derived from these same ranges so
# the preallocated arrays always match the number of loop iterations, even when
# the step does not evenly divide (image_size - kernel_size).
# NOTE(review): range() excludes the final offset image_size - kernel_size, so
# the last full window along each axis is not sampled — confirm this is intended.
row_starts = range(0, image_size[0] - kernel_size[0], step_size[0])
col_starts = range(0, image_size[1] - kernel_size[1], step_size[1])

n_rows    = len(row_starts)
n_cols    = len(col_starts)
n_patches = len(image_filepaths) * n_rows * n_cols

# Patches are stored as 2-D arrays, so the loaded images are assumed to be
# single-channel — TODO confirm against the files on disk.
dataset   = np.zeros([n_patches, kernel_size[0], kernel_size[1]])
masks     = np.zeros([n_patches, kernel_size[0], kernel_size[1]])
labels    = np.zeros(n_patches)
ids       = np.array([None]*n_patches)
ix        = 0

for fp in tqdm(image_filepaths):
    image       = imread(fp)
    image_id    = get_image_id(fp)
    mask        = get_mask_from_id(image_id, mask_filepaths)
    image, mask = resize_image_and_mask(image, mask, image_size)
    for r in row_starts:
        for c in col_starts:
            image_patch       = image[r:r + kernel_size[0], c:c + kernel_size[1]]
            mask_patch        = mask[r:r + kernel_size[0], c:c + kernel_size[1]]
            dataset[ix, :, :] = image_patch
            masks[ix, :, :]   = mask_patch
            # Strict comparison: a sum exactly equal to the threshold is labelled 0.
            labels[ix]        = np.sum(mask_patch) > crack_threshold
            ids[ix]           = uuid.uuid4().hex
            ix += 1

# Every preallocated slot must have been filled exactly once.
assert ix == n_patches, 'Filled %d patches but allocated %d.' % (ix, n_patches)

print('Images subset successfully.')

100%|██████████████████████████████████████████| 52/52 [00:16<00:00,  2.09s/it]


Images subset successfully.


In [144]:
# Total of the 0/1 patch labels, i.e. how many patches were labelled as cracked.
np.sum(labels)

480.0

In [146]:
# NOTE(review): this assertion can never pass, so the cell always raises
# AssertionError. Presumably an intentional barrier that stops "Run All" before
# the cells below — confirm, or remove it so the notebook runs top to bottom.
assert 1 + 1 == 3

AssertionError: 

In [159]:
# The patch datasets are now built.
# Split them into training, validation, and test sets.

def train_validate_test_split(X, y, frac_train=0.6, frac_validate=0.15, frac_test=0.25, stratify=True):
    try:
        assert sum([frac_train, frac_validate, frac_test]) == 1
    except AssertionError:
        raise ValueError('The training, test, and validation fractions do not sum to 1.')