## Data Preparation & Augmentation
In this file the dataset is loaded and transformed into all the formats required by the different classification algorithms. When calling this file, **make sure** you have a few global constants set:
* REQUIRED_DIMENSIONS
* TRAIN_BATCH_SIZE
* VALIDATION_BATCH_SIZE 
* TEST_BATCH_SIZE
* N_AUG_VS_TEST

In addition, this file creates output constants:
* DATASET_ROOT
* TRAIN_FOLDER      
* VALIDATION_FOLDER 
* TEST_FOLDER
* CLASSES

And, output keras generators:
* test_set
* train_set
* validation_set
* (augmentation versions)

And, output raw congregates (all of the dataset together):
* Images
* rawPixels
* Features
* Labels
* (augmentation versions)

#### Imports and Correct File Usage Checks

In [None]:
import lib
import time
import os
import argparse
import imutils
import cv2
import numpy as np
import imgaug as ia
import imgaug.augmenters as aug
import tensorflow as tf
import matplotlib as mp

from matplotlib import pyplot as pp
from matplotlib import image as mpimg
from tensorflow import keras as kr

In [None]:
# Do not run this file by itself: these variables must be defined
# in the file importing this one, as they are implementation dependent.
# The names are placed in a tuple so that EVERY one of them is evaluated;
# a chained `or` would stop at the first truthy value and never notice
# that one of the later constants is undefined.
try:
    (REQUIRED_DIMENSIONS,
     TRAIN_BATCH_SIZE,
     VALIDATION_BATCH_SIZE,
     TEST_BATCH_SIZE,
     N_AUG_VS_TEST)
except NameError as error:
    # Include the NameError text so the message says which constant is missing
    raise Exception(
        'One of the required global constants is missing ({}).\n'
        ' -> Do not run this file by itself?'.format(error)
    )

# Implementation independent globals
DATASET_ROOT      = './dataset/'                 # Root folder holding one sub-folder per set
TRAIN_FOLDER      = 'train'                      # Sub-folder of DATASET_ROOT with the training set
VALIDATION_FOLDER = 'validation'                 # Sub-folder of DATASET_ROOT with the validation set
TEST_FOLDER       = 'test'                       # Sub-folder of DATASET_ROOT with the test set
CLASSES           = ['biomass', 'non_biomass']   # Class names

In [None]:
# Remember when preparation started so the total duration can be reported at the end
start_time = time.time()
# NOTE(review): lib is a project module not shown here; log() presumably records a progress message
lib.log("Data preparation started.")

#### Custom Functions

In [None]:
def sometimes(a):
    """Wrap augmenter `a` so that it is only applied in 35% of all cases."""
    return aug.Sometimes(0.35, a)


# Augmentation pipeline (used by augment() and the augmented generators)
# More information: https://github.com/aleju/imgaug
seq = aug.Sequential([
    # Crop a randomly chosen 0 to 16 pixels off each side of the image
    aug.Crop(px=(0, 16)),
    # Flip every image horizontally (probability 1)
    aug.Fliplr(1),
    # Flip half of the images vertically
    aug.Flipud(0.5),
    # Random affine transformation, only applied sometimes
    sometimes(aug.Affine(
        # Scale to 80-120% of the original size, independently per axis
        scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
        # Shift by -20% to +20% of the image size, per axis
        translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)},
        # Rotate by -45 to +45 degrees
        rotate=(-45, 45),
        # Shear by -16 to +16 degrees
        shear=(-16, 16),
        # Nearest neighbour or bilinear interpolation (the fast ones)
        order=[0, 1],
        # Fill value between 0 and 255, used when the mode is constant
        cval=(0, 255),
        # Allow any of scikit-image's warping modes
        mode=ia.ALL
    )),
])

In [None]:
def augment(img):
    """Run a single image through the `seq` augmentation pipeline and return the result."""
    # Freeze the pipeline's random choices first, then apply it to the image
    frozen_pipeline = seq.to_deterministic()
    return frozen_pipeline.augment_image(img)

In [None]:
def image_to_feature_vector(image, size=REQUIRED_DIMENSIONS):
    """Resize `image` to the fixed `size` and return its raw pixel intensities flattened."""
    resized = cv2.resize(image, size)
    return resized.flatten()

In [None]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Describe the colour distribution of `image` as a flat, normalized histogram.

    The image is converted from BGR to HSV and a 3D histogram is computed
    over all three channels using `bins` bins per channel.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    channel_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], [0, 1, 2], None, bins, channel_ranges)

    if not imutils.is_cv2():
        # OpenCV 3 and later: normalization is done "in place" on the histogram
        cv2.normalize(histogram, histogram)
    else:
        # OpenCV 2.4.X: normalize() returns the normalized histogram
        histogram = cv2.normalize(histogram)

    # The flattened histogram is the feature vector
    return histogram.flatten()

In [None]:
# Progress marker: sometimes, seq, augment and the feature helpers are now defined
lib.log("Loaded custom functions.")

#### Load Existing Dataset & Perform Data Augmentation

Build Congregates

In [None]:
# Congregates: every image of every set found under DATASET_ROOT, together
Images = []     # All the images as returned by cv2.imread
rawPixels = []  # Flattened pixel intensities of 'Images' (resized to REQUIRED_DIMENSIONS)
Features = []   # Colour histogram features extracted from 'Images'
Labels = []     # Labels (class folder names) for all the above

# Augmented versions of the same four congregates
augmented_Images = []
augmented_rawPixels = []
augmented_Features = []
augmented_Labels = []

# Expected layout: DATASET_ROOT/{set}/{class}/{image file}
# os.listdir returns entries in arbitrary order, so every listing is sorted
# to build the congregates in the same order on every run.
for setName in sorted(os.listdir(DATASET_ROOT)):
    setPath = os.path.join(DATASET_ROOT, setName)
    # A stray file next to the set folders (e.g. .DS_Store) cannot be listed
    if not os.path.isdir(setPath):
        print("(SKIPPED) Found a non directory entry: ", setPath)
        continue

    for className in sorted(os.listdir(setPath)):
        classPath = os.path.join(setPath, className)
        # Same guard one level down: only class folders hold images
        if not os.path.isdir(classPath):
            print("(SKIPPED) Found a non directory entry: ", classPath)
            continue

        for imageName in sorted(os.listdir(classPath)):
            imagePath = os.path.join(classPath, imageName)
            # Load the image; its label is the name of the class folder it is in
            image = cv2.imread(imagePath)
            label = className

            # cv2.imread returns None when the file cannot be read as an image
            if image is None:
                print("(SKIPPED) Found a non image file: ", imagePath)
                continue

            # Extract raw pixel intensity "features", followed by a color
            # histogram to characterize the color distribution of the pixels
            pixels = image_to_feature_vector(image)
            hist = extract_color_histogram(image)

            # Update the raw images, features, and labels matrices respectively
            Images.append(image)
            rawPixels.append(pixels)
            Features.append(hist)
            Labels.append(label)

            # Do the same for an augmented version of the image
            image = augment(image)
            pixels = image_to_feature_vector(image)
            hist = extract_color_histogram(image)

            # Update augmentation storages
            augmented_Images.append(image)
            augmented_rawPixels.append(pixels)
            augmented_Features.append(hist)
            augmented_Labels.append(label)


# Make numpy arrays, because they are more useful downstream
Images = np.array(Images)
rawPixels = np.array(rawPixels)
Features = np.array(Features)
Labels = np.array(Labels)

# Same for augmentations
augmented_Images = np.array(augmented_Images)
augmented_rawPixels = np.array(augmented_rawPixels)
augmented_Features = np.array(augmented_Features)
augmented_Labels = np.array(augmented_Labels)

# Display some useful information.
# Sizes are reported with a consistent 1024 * 1024 bytes per MB.
print("[Non Augmented Images]")
print("\t - Number of images in total: " + str(len(Images)))
print("\t - Raw pixels matrix: {:.2f}MB".format(rawPixels.nbytes / (1024 * 1024.0)))
print("\t - Raw features matrix: {:.2f}MB".format(Features.nbytes / (1024 * 1024.0)))

# Same for augmentations
print("[Augmented Images]")
print("\t - Number of images in total: " + str(len(augmented_Images)))
print("\t - Raw pixels matrix: {:.2f}MB".format(augmented_rawPixels.nbytes / (1024 * 1024.0)))
print("\t - Raw features matrix: {:.2f}MB".format(augmented_Features.nbytes / (1024 * 1024.0)))

In [None]:
# Progress marker: the plain and augmented congregate arrays are now built
lib.log("Loaded raw congregates of images, features and labels.")

Build Keras Generators

In [None]:
# Shortcut
IDG = kr.preprocessing.image.ImageDataGenerator

# No augmentation
generator = IDG()

# Augmentation: every image is passed through augment() before being yielded
augmented_generator = IDG(preprocessing_function=augment)
# The original, misspelled name is kept as an alias in case importing
# files refer to it.
agumented_generator = augmented_generator

print("[Non Augmented Generators]")

# Iterator for training data set
train_set = generator.flow_from_directory(
    DATASET_ROOT + TRAIN_FOLDER,
    target_size=REQUIRED_DIMENSIONS,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=False
)

# Iterator for validation data set
validation_set = generator.flow_from_directory(
    DATASET_ROOT + VALIDATION_FOLDER,
    target_size=REQUIRED_DIMENSIONS,
    batch_size=VALIDATION_BATCH_SIZE,
    shuffle=False
)

# Iterator for test data set
test_set = generator.flow_from_directory(
    DATASET_ROOT + TEST_FOLDER,
    target_size=REQUIRED_DIMENSIONS,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False
)

print("[Augmented Generators]")

# Iterator for augmented training data set
augmented_train_set = augmented_generator.flow_from_directory(
    DATASET_ROOT + TRAIN_FOLDER,
    target_size=REQUIRED_DIMENSIONS,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=False
)

# Iterator for augmented test data set.
# Uses the test batch size, like its non-augmented counterpart.
augmented_test_set = augmented_generator.flow_from_directory(
    DATASET_ROOT + TEST_FOLDER,
    target_size=REQUIRED_DIMENSIONS,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False
)

# Iterator for augmented validation data set.
# Uses the validation batch size, like its non-augmented counterpart.
augmented_validation_set = augmented_generator.flow_from_directory(
    DATASET_ROOT + VALIDATION_FOLDER,
    target_size=REQUIRED_DIMENSIONS,
    batch_size=VALIDATION_BATCH_SIZE,
    shuffle=False
)

In [None]:
# Progress marker: the plain and augmented Keras iterators are now created
lib.log("Loaded all generators.")

In [None]:
# Report the wall-clock seconds elapsed since start_time was recorded
lib.log("Data preperation completed!\nTime taken: " + str(time.time() - start_time) + " seconds.")