*Author: Jerome Wynne (Jerome.Wynne.2014@bristol.ac.uk)*

*Last updated: 22/06/2017*

# Panel Resizer

This script resizes the PV panel images to a fixed size. It drops three images in the process.

In [1]:
# This notebook was written for Python 3.5; report the running interpreter's version
import sys
interpreter_version = sys.version
print(interpreter_version)

3.5.2 |Anaconda 4.1.1 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]


In [2]:
# Get image filepaths
import glob
# The backslash before '*' is escaped explicitly: a bare '\*' is an invalid
# escape sequence that newer Python versions warn about. The pattern's runtime
# value is unchanged.
# NOTE(review): glob makes no ordering guarantee, so the image ids assigned
# below depend on the order the OS lists the files in - confirm this is stable
# on the machine that produced the label files.
raw_image_fps = glob.glob('.\\data\\raw-images\\*.png')

# Read in images
from scipy.misc import imread, imresize, imsave
import numpy as np
raw_images = [imread(fp) for fp in raw_image_fps]
# One integer id per image (its position in raw_image_fps); later cells use
# these ids to name the output files
image_ids  = np.arange(0, len(raw_images))

# Get image dimensions
image_dimensions = [np.shape(img) for img in raw_images]

# Show the image dimensions in ascending order (tuples compare by height first)
sorted(image_dimensions)

[(489, 479),
 (775, 772),
 (776, 773),
 (959, 967),
 (963, 954),
 (963, 956),
 (963, 966),
 (965, 962),
 (966, 962),
 (966, 962),
 (966, 963),
 (966, 964),
 (966, 964),
 (966, 965),
 (966, 965),
 (967, 956),
 (967, 958),
 (967, 963),
 (967, 964),
 (967, 964),
 (968, 955),
 (968, 956),
 (968, 963),
 (968, 964),
 (968, 965),
 (968, 965),
 (969, 955),
 (969, 957),
 (970, 965),
 (970, 973),
 (972, 963),
 (972, 964),
 (973, 963),
 (973, 963),
 (973, 963),
 (973, 963),
 (973, 963),
 (973, 963),
 (973, 964),
 (973, 964),
 (973, 964),
 (973, 964),
 (973, 964),
 (974, 962),
 (974, 962),
 (974, 963),
 (974, 963),
 (974, 963),
 (974, 963),
 (976, 962),
 (978, 963),
 (981, 963)]

As is apparent from the above, three images in the dataset are smaller than the others. I'm concerned that rescaling these images might impede the ability of a classifier to identify damage, so I will drop them (for the time being, at least).

In [3]:
# Number of undersized images to discard from the small end of the size ordering
N_DROPPED = 3

# Order the ids by the shape of the image they refer to, discard the smallest
# N_DROPPED, then pick the images out in that same order so that image_ids[k]
# still identifies raw_images[k].
image_ids  = sorted(image_ids, key=lambda j: np.shape(raw_images[j]))[N_DROPPED:]
raw_images = [raw_images[j] for j in image_ids]

# Resize images and save them to a new folder
import os
os.makedirs('.\\data\\resized-images', exist_ok=True)

# Get destination image size: the mean of the dimensions of the images we kept.
# The dimensions are recomputed here because the list gathered before the drop
# still contains the discarded undersized images, which would skew the mean.
kept_dimensions = [np.shape(img) for img in raw_images]
dst_dim  = np.ceil(np.mean(np.asarray(kept_dimensions), axis=0)).astype(int)
dst_dim[1] = dst_dim[0] # To simplify matters, we make the images square

# Resize images
resized_images = [imresize(img, dst_dim) for img in raw_images]

# Write images to local directory, named after their original image id
for image_id, resized in zip(image_ids, resized_images):
    imsave('.\\data\\resized-images\\resized_' + str(image_id) + '.png', resized)

The resized images should now be available locally. 

# Panel Subsetter

The script below writes out a 6x6 grid of subimages for each resized image. If this doesn't work, I'll investigate using a sliding window to subset the images.

In [4]:
import os
from scipy.misc import imsave

# Make sure the output folder for the grid tiles exists
subset_dir = '.\\data\\subset-images'
if not os.path.exists(subset_dir):
    os.makedirs(subset_dir)

# Side length of one tile: a sixth of the resized image width, rounded down
sub_dim = np.floor(dst_dim[1] / 6).astype(int)

# Cut every resized panel into a 6x6 grid and save each tile as
# subset_<image id>_<row><column>.png
for panel_id, panel in zip(image_ids, resized_images):
    for col in range(6):
        col_span = slice(col * sub_dim, (col + 1) * sub_dim)
        for row in range(6):
            row_span = slice(row * sub_dim, (row + 1) * sub_dim)
            tile = panel[row_span, col_span]
            imsave('.\\data\\subset-images\\subset_' + str(panel_id) + '_' + str(row) + str(col) + '.png', tile)

# Panel Train-Test-Splitter

This script performs a 50:50 stratified split of the subset data into training and testing images. There are 105 crack images and 1695 no-crack images.

In [5]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the per-tile labels (columns used below: 'filename' and 'cracked')
image_labels = pd.read_csv('.\\data\\labels\\subset-cracked.csv')
image_labels.head()

# Stratified 50:50 split on the 'cracked' label
np.random.seed(42)
filenames = image_labels['filename']
cracked = image_labels['cracked']
fp_train, fp_test, labels_train, labels_test = train_test_split(
    filenames, cracked, train_size=0.5, stratify=cracked, random_state=1)

# One entry per split: (image folder, filenames, labels, label file to write)
splits = [
    ('.\\data\\subset-images\\training', fp_train, labels_train,
     '.\\data\\labels\\subset-training-labels-cracked.csv'),
    ('.\\data\\subset-images\\testing', fp_test, labels_test,
     '.\\data\\labels\\subset-testing-labels-cracked.csv'),
]

# Make directories for training and testing image sets
for split_dir, _, _, _ in splits:
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)

# Move the images to their respective folders
for split_dir, split_filenames, _, _ in splits:
    for name in split_filenames:
        os.rename(".\\data\\subset-images\\" + name, split_dir + '\\' + name)

# Save the labels of each split as a two-column [filename, cracked] file
for _, split_filenames, split_labels, labels_csv in splits:
    pd.concat([split_filenames, split_labels], axis=1).to_csv(labels_csv, index=False)

# Training Set Augmenter

The following script augments the training set to create a balanced set of responses. It writes the generated images and their associated labels to the hard disk.

In [11]:
from scipy.misc import imread, imsave
from skimage.transform import AffineTransform, warp
from skimage.util import random_noise

def augment_image(img_fp, dst_fp):
    """Save a randomly perturbed copy of the image at img_fp to dst_fp.

    The perturbation is a slight rescale (factor in [0.98, 1.02)) and a small
    rotation (within +/- pi/36 radians, i.e. 5 degrees), followed by optional
    vertical and horizontal flips and multiplicative speckle noise. All random
    draws come from numpy's global random state, so seed it beforehand for
    reproducible output.

    Parameters
    ----------
    img_fp : str
        Path of the image to read.
    dst_fp : str
        Path the augmented image is written to.

    Returns
    -------
    None
    """
    img        = imread(img_fp)
    # Draw every random parameter up front, in a fixed order
    flip_v     = np.random.randint(0, 2)
    flip_h     = np.random.randint(0, 2)
    # Keep the rescale tiny: some of the cracks sit at the extremes of the frame
    scale_f    = np.random.uniform(0.98, 1.02)
    rotate_f   = np.random.uniform(-np.pi/36, np.pi/36)
    tform      = AffineTransform(scale = (scale_f, scale_f), rotation = rotate_f)
    aug_img    = warp(img, tform)
    if flip_v: aug_img = np.flip(aug_img, axis = 0)
    if flip_h: aug_img = np.flip(aug_img, axis = 1)
    aug_img    = random_noise(aug_img, 'speckle', var = 0.01)
    # skimage bumps the images from [0, 255] to [0, 1]; stretch back so the
    # brightest pixel is 255. An all-black result has a peak of 0, so skip the
    # division there instead of producing NaNs from 0/0.
    peak       = aug_img.max()
    if peak > 0:
        aug_img = aug_img*255/peak
    else:
        aug_img = aug_img*255
    aug_img    = aug_img.astype(int)
    imsave(dst_fp, aug_img)
    
import numpy as np
import pandas as pd
np.random.seed(42)
# Read in training dataset [filename|cracked?]
training_labels = pd.read_csv('.\\data\\labels\\subset-training-labels-cracked.csv')

# Get the names of the images containing cracks only
is_cracked = training_labels['cracked'] == 1
cracked_filenames = training_labels.loc[is_cracked, 'filename'].reset_index(drop=True)

# Number of augmented (cracked) images to generate: one per row that is not
# labelled cracked.
# NOTE(review): after concatenation the cracked class is therefore larger than
# the non-cracked class by the original number of cracked rows - confirm
# whether an exact balance was intended.
n_aug_images = len(training_labels) - int(is_cracked.sum())

if n_aug_images > 0 and len(cracked_filenames) == 0:
    # Without this check the sampling below fails with an opaque
    # "low >= high" error from np.random.randint
    raise ValueError('No cracked images in the training labels to augment from')

training_dir = '.\\data\\subset-images\\training\\'
augmented_rows = []
for j in range(n_aug_images):
    # Sample a cracked source image uniformly at random, with replacement
    ix = np.random.randint(0, len(cracked_filenames))
    src_name = cracked_filenames[ix]
    # <source name without its 4-character extension>_aug_<j>.png
    aug_name = src_name[:-4] + '_aug_' + str(j) + '.png'
    img_fp = training_dir + src_name
    aug_img_fp = training_dir + aug_name
    # Image generation is left disabled, as found; re-enable the call below to
    # (re)write the augmented images to disk.
    #augment_image(img_fp, aug_img_fp)
    # Row order matches the [filename, cracked] schema of the training labels
    augmented_rows.append((aug_name, 1))

# Create a separate dataframe for the augmented images. The columns are given
# as a list: a set has no defined order, which could swap the two fields.
augmented_labels = pd.DataFrame(augmented_rows, columns=['filename', 'cracked'])

aug_training_labels = pd.concat([training_labels, augmented_labels], axis=0)
aug_training_labels.to_csv('.\\data\\labels\\augmented-subset-training-cracked.csv', index=False)