## Data Preparation & Augmentation
In this file the dataset is loaded and transformed into all the formats required by the different classification algorithms. When calling this file, **make sure** you have a few global constants set:
* REQUIRED_DIMENSIONS
* TRAIN_BATCH_SIZE
* VALIDATION_BATCH_SIZE 
* TEST_BATCH_SIZE
* N_AUG_VS_TEST

In addition, this file creates output constants:
* DATASET_ROOT
* TRAIN_FOLDER      
* VALIDATION_FOLDER 
* TEST_FOLDER
* CLASSES

And, output generators:
* test_set
* train_set
* validation_set
* (augmentation versions)

And, output raw congregates:
* rawImages
* rawFeatures
* rawLabels

#### Imports and Correct File Usage Checks

In [None]:
import lib
import time
import os
import argparse
import imutils
import cv2
import numpy as np
import imgaug as ia
import tensorflow as tf
import matplotlib as mp

from matplotlib import pyplot as pp
from tensorflow import keras as kr

In [None]:
# Do not run this file by itself: the constants checked below are
# implementation dependent and must be defined by the file that includes
# this one.
try:
    # Build a tuple so that EVERY name is looked up. An `a or b or c` chain
    # stops at the first truthy value, which would let a missing constant
    # further down the list go unnoticed.
    (
        REQUIRED_DIMENSIONS,
        TRAIN_BATCH_SIZE,
        VALIDATION_BATCH_SIZE,
        TEST_BATCH_SIZE,
        N_AUG_VS_TEST,
    )
except NameError:
    # The original NameError stays attached as context, so the traceback
    # still shows which constant was missing.
    raise Exception('One of the required global constants is missing.\n -> Do not run this file by itself?')

# Implementation independent globals (outputs of this file)
DATASET_ROOT      = './dataset/'
TRAIN_FOLDER      = 'train'
VALIDATION_FOLDER = 'validation'
TEST_FOLDER       = 'test'
CLASSES           = ['biomass', 'non_biomass']

In [None]:
# Start the wall-clock timer first: the last cell of this file reports
# time.time() - start_time as the total data preparation time.
start_time = time.time()
lib.log("Data preparation started.")

#### Custom Functions

In [None]:
def sometimes(aug):
    """Wrap `aug` in Sometimes(0.5, ...) so it is applied in 50% of all cases.

    For example, sometimes(GaussianBlur(0.3)) would blur roughly every
    second image.
    """
    return ia.augmenters.Sometimes(0.5, aug)


# Augmentation pipeline used for the training set
seq = ia.augmenters.Sequential([
    ia.augmenters.Crop(px=(0, 16)),  # crop each side by 0 to 16px (randomly chosen)
    ia.augmenters.Fliplr(0.5),       # horizontally flip 50% of the images
    ia.augmenters.Flipud(0.2),       # vertically flip 20% of the images
])

In [None]:
def augment(img):
    """Return `img` after passing it through the `seq` augmentation pipeline.

    A deterministic copy of the pipeline is taken on every call and that
    copy is what augments the image.
    """
    return seq.to_deterministic().augment_image(img)

In [None]:
# Tool to display data set and its labels
def plots(ims, figsize=(12,6), rows=1, interp=False, titles=None):
    """Draw a batch of images in a grid with `rows` rows.

    ims     -- sequence of images. When its elements are numpy arrays the
               batch is stacked and cast to uint8; if the last axis is not
               of size 3 the axes are reordered with transpose((0, 2, 3, 1))
               (presumably channels-first to channels-last).
    figsize -- size passed to the matplotlib figure.
    rows    -- number of grid rows; columns are derived from it.
    interp  -- if true, matplotlib's default interpolation is used,
               otherwise interpolation is switched off ('none').
    titles  -- optional sequence with one title per image.

    Returns None. An empty batch draws nothing.
    """
    n_images = len(ims)
    if n_images == 0:
        # Nothing to show; also avoids indexing ims[0] on an empty batch.
        return

    if isinstance(ims[0], np.ndarray):
        ims = np.array(ims).astype(np.uint8)
        if ims.shape[-1] != 3:
            ims = ims.transpose((0, 2, 3, 1))

    f = pp.figure(figsize=figsize)
    # Ceiling division: the smallest column count for which rows * cols can
    # hold every image, whatever the value of `rows`.
    cols = -(-n_images // rows)
    for i, im in enumerate(ims):
        sp = f.add_subplot(rows, cols, i + 1)
        sp.axis('Off')
        if titles is not None:
            sp.set_title(titles[i], fontsize=16)
        pp.imshow(im, interpolation=None if interp else 'none')

In [None]:
def image_to_feature_vector(image, size=REQUIRED_DIMENSIONS):
    """Resize `image` to the fixed `size` and return its raw pixel
    intensities as a flat vector.
    """
    # NOTE(review): cv2.resize reads `size` as (width, height), while the
    # Keras generators in this file receive the same constant as target_size
    # -- confirm both agree if REQUIRED_DIMENSIONS is ever non-square.
    resized = cv2.resize(image, size)
    return resized.flatten()

In [None]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Return a flattened, normalized 3D HSV color histogram of `image`.

    `image` is converted from BGR to HSV and a histogram over all three
    channels is computed with `bins` bins per channel.
    """
    channels = [0, 1, 2]
    value_ranges = [0, 180, 0, 256, 0, 256]

    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    histogram = cv2.calcHist([hsv_image], channels, None, bins, value_ranges)

    if imutils.is_cv2():
        # OpenCV 2.4.X: normalize returns the normalized histogram
        histogram = cv2.normalize(histogram)
    else:
        # OpenCV 3: the histogram is normalized in place
        cv2.normalize(histogram, histogram)

    # the flattened histogram is the feature vector
    return histogram.flatten()

In [None]:
# Progress marker: all helper functions of this file are defined at this point.
lib.log("Loaded custom functions.")

#### Load Existing Dataset & Perform Data Augmentation

Build Raw Congregates

In [None]:
# Raw Congregate
rawImages = []
rawFeatures = []
rawLabels = []

# Expected layout: DATASET_ROOT/{set}/{class}/{image}.
# Listings are sorted because os.listdir returns entries in arbitrary order;
# sorting keeps the three arrays in the same order on every run.
for setName in sorted(os.listdir(DATASET_ROOT)):
    setPath = os.path.join(DATASET_ROOT, setName)
    if not os.path.isdir(setPath):
        # Stray files next to the set folders would make os.listdir fail
        continue
    for className in sorted(os.listdir(setPath)):
        classPath = os.path.join(setPath, className)
        if not os.path.isdir(classPath):
            continue
        for imageName in sorted(os.listdir(classPath)):
            imagePath = os.path.join(classPath, imageName)
            # load the image; its class label is the name of the folder
            # that contains it
            image = cv2.imread(imagePath)
            label = className

            # cv2.imread gives back None when the file cannot be read as
            # an image
            if image is None:
                print("(SKIPPED) Found a non image file: ", imagePath)
                continue

            # extract raw pixel intensity "features", followed by a color
            # histogram to characterize the color distribution of the pixels
            # in the image
            pixels = image_to_feature_vector(image)
            hist = extract_color_histogram(image)

            # update the raw images, features, and labels matrices,
            # respectively
            rawImages.append(pixels)
            rawFeatures.append(hist)
            rawLabels.append(label)

# Make numpy arrays, because more useful
rawImages = np.array(rawImages)
rawFeatures = np.array(rawFeatures)
rawLabels = np.array(rawLabels)

# Display some useful information (sizes use 1MB = 1024 * 1024 bytes)
print("Number of images in total: " + str(len(rawImages)))
print("Raw pixels matrix: {:.2f}MB".format(rawImages.nbytes / (1024 * 1024.0)))
print("Raw features matrix: {:.2f}MB".format(rawFeatures.nbytes / (1024 * 1024.0)))

In [None]:
# Progress marker: rawImages, rawFeatures and rawLabels are now numpy arrays.
lib.log("Loaded raw congregates of images, features and labels.")

Build Keras Generators & Augment Data

In [None]:
# Shortcut
IDG = kr.preprocessing.image.ImageDataGenerator


def _flow(source, folder, batch_size):
    """Build an unshuffled iterator over DATASET_ROOT + folder from `source`.

    Images are resized to REQUIRED_DIMENSIONS and served `batch_size` at a time.
    """
    return source.flow_from_directory(
        DATASET_ROOT + folder,
        target_size=REQUIRED_DIMENSIONS,
        batch_size=batch_size,
        shuffle=False,
    )


# Plain generator (no augmentation) and one that passes every image through
# augment(). The misspelled name `agumented_generator` is kept as-is because
# it is a module-level name other files may refer to.
generator = IDG()
agumented_generator = IDG(preprocessing_function=augment)

# Iterators for the training, validation and test data sets
train_set = _flow(generator, TRAIN_FOLDER, TRAIN_BATCH_SIZE)
validation_set = _flow(generator, VALIDATION_FOLDER, VALIDATION_BATCH_SIZE)
test_set = _flow(generator, TEST_FOLDER, TEST_BATCH_SIZE)

# Iterator for the augmented training data set (same folder as train_set)
augmented_train_set = _flow(agumented_generator, TRAIN_FOLDER, TRAIN_BATCH_SIZE)

In [None]:
lib.log("Loaded all generators.")

#### Visualising: Augmented Data VS Non Augmented Data

In [None]:
lib.log("Displaying first {} batches of the augmented_train_set VS train_set:".format(N_AUG_VS_TEST))

# All augmented batches are plotted first, then the same number of plain
# batches. Each next() call advances the corresponding iterator.
for batch_source in (augmented_train_set, train_set):
    for i in range(N_AUG_VS_TEST):
        imgs, labels = next(batch_source)
        plots(imgs, titles=labels)

In [None]:
# Report the wall-clock time elapsed since start_time was taken.
lib.log("Data preperation completed!\nTime taken: {} seconds.".format(time.time() - start_time))