# Image Pre-Processing

In [1]:
# Create the train/validate/test output directories for each class
# (lock-n-key, lock-pick, empty) under datasets/, echoing each class name.
# `mkdir -p` makes re-running this cell harmless when they already exist.
!for cat in $(echo 'lock-n-key lock-pick empty'); do \
    echo $cat; mkdir -p "datasets/train/$cat" "datasets/validate/$cat" "datasets/test/$cat"; \
done

lock-n-key
lock-pick
empty


In [20]:
# Install the third-party dependencies used below.
# `%pip` (unlike `!pip`) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install pyheif
%pip install tensorflow



## Image Conversion & Resizing

Images in Apple's HEIC format need to be converted into JPEG form in order to be read by standard Python libraries. 

Rectangular images are also resized into squares. For simplicity, they are scaled instead of cropped, meaning that this operation could also be carried out by `ImageDataGenerator`.

In [2]:
# Edge lengths, in pixels, that every image is resized to. Used by
# resize_and_save during conversion and passed as `target_size` to
# flow_from_directory during augmentation.
target_size = (224, 224)

In [3]:
from enum import Enum, unique
import os
from PIL import Image
import pyheif
import random

@unique
class ImageFormat(Enum):
    """Image formats the conversion step distinguishes between.

    ``OTHER`` is the fallback for any extension missing from ``_ext_map``.
    """
    HEIF = 'HEIF'
    JPEG = 'JPEG'
    PNG = 'PNG'
    OTHER = ''

    @classmethod
    def from_ext(cls, ext):
        """Return the format for a file extension such as ``'.jpg'``.

        The lookup is case-insensitive; unknown extensions yield ``OTHER``.
        """
        return cls._ext_map.get(ext.lower(), cls.OTHER)


# Extension -> format lookup used by ImageFormat.from_ext. It is attached
# after the class statement because its values are the enum members.
ImageFormat._ext_map = {
    extension: image_format
    for image_format, extensions in (
        (ImageFormat.HEIF, ('.heic',)),
        (ImageFormat.JPEG, ('.jpg', '.jpeg')),
        (ImageFormat.PNG, ('.png',)),
    )
    for extension in extensions
}

In [4]:
def remove_prefix(s, prefix):
    """Return ``s`` with a leading ``prefix`` stripped.

    ``s`` is returned unchanged when it does not start with ``prefix``.
    """
    if not s.startswith(prefix):
        return s
    return s[len(prefix):]

# Root of the raw image tree. The keys of class_map are directory paths
# relative to this root (with a leading "/").
INPUT_DIR = 'tmp/Data Collection (Images)'

# Root under which get_output_path builds "<dataset>/<class>" directories.
OUTPUT_DIR = 'datasets'

In [5]:
# Presumably the intended total number of images per class.
# NOTE(review): not referenced by any code visible in this notebook; the quotas
# actually enforced are the hard-coded lists in dataset_counts. Confirm before
# relying on this value.
num_images_per_class = 1000

# Presumably the intended share of images for each dataset.
# NOTE(review): these fractions are likewise not referenced by the visible code.
validation_fraction = 0.2
test_fraction = 0.2
training_fraction = 1 - test_fraction - validation_fraction

# TODO: reduce hardcoding

# Maps each source directory (path relative to INPUT_DIR, with a leading "/")
# to its class label. The label doubles as the output sub-directory name.
# process_files only converts files in directories that appear as keys here.
class_map = {
    '/Lock Picking/An Qi': 'lock-pick',
    '/Lock Picking/Dorcas': 'lock-pick',
    '/Lock Picking/Dorcas-final': 'lock-pick',
    '/Lock Picking/Lucas': 'lock-pick',
    '/Lock Picking/Rectangular': 'lock-pick',
    '/Lock Picking/Steve': 'lock-pick',
    '/Lock + Key/An Qi': 'lock-n-key',
    '/Lock + Key/Dorcas': 'lock-n-key',
    '/Lock + Key/Dorcas-final': 'lock-n-key',
    '/Lock + Key/Lucas': 'lock-n-key',
    '/Lock + Key/Steve': 'lock-n-key',
    '/Empty/An Qi': 'empty',
    '/Empty/Dorcas': 'empty',
    '/Empty/Dorcas-final': 'empty',
    '/Empty/Lucas': 'empty',
    '/Empty/Steve': 'empty',
}

def open_heif_image(filepath):
    """Decode the HEIF file at ``filepath`` and return it as a PIL image.

    The file is read with ``pyheif`` and the decoded buffer is wrapped with
    ``Image.frombytes`` using the "raw" decoder, passing the decoded mode and
    stride through unchanged.
    """
    decoded = pyheif.read(filepath)
    pixel_mode = decoded.mode
    return Image.frombytes(
        pixel_mode,
        decoded.size,
        decoded.data,
        "raw",
        pixel_mode,
        decoded.stride,
    )


def open_image(filepath):
    """Open a non-HEIF image and return it upright as a loaded PIL image.

    Cameras commonly store photos unrotated and record the intended rotation
    in the EXIF orientation tag. ``Image.open`` does not apply that tag, so an
    image that is later resized and re-saved comes out rotated. Applying
    ``ImageOps.exif_transpose`` here bakes the orientation into the pixels.

    The returned image is an in-memory copy, so the source file handle is
    closed before returning instead of being left open.
    """
    # Imported locally so this function does not depend on edits to the
    # notebook's import cell; PIL itself is already a dependency of the file.
    from PIL import ImageOps

    with Image.open(filepath) as source:
        return ImageOps.exif_transpose(source)

# TODO: better abstraction and fewer global variables

# Dataset names. Positions match the entries of each list in dataset_counts
# and the indices returned by weighted_random_choice.
datasets = ['train', 'test', 'validate']
# Remaining image quota per class, ordered like `datasets`.
# get_random_dataset decrements an entry each time it assigns an image there,
# so this dict is mutated while files are processed.
dataset_counts = {
    'lock-pick': [600, 200, 200], 
    'lock-n-key': [600, 200, 200], 
    'empty': [600, 200, 200]
}

def weighted_random_choice():
    """Draw a dataset index (0, 1 or 2) with relative weights 600:200:200."""
    dataset_indices = [0, 1, 2]
    relative_weights = [600, 200, 200]
    (picked,) = random.choices(dataset_indices, relative_weights)
    return picked


def get_random_dataset(class_name):
    """Pick the dataset the next image of ``class_name`` is assigned to.

    A dataset index is drawn with ``weighted_random_choice`` and redrawn until
    it lands on a dataset that still has quota left in ``dataset_counts``.
    The chosen quota is decremented, so every call consumes one slot.

    Returns:
        The dataset name from ``datasets``.

    Raises:
        KeyError: if ``class_name`` has no entry in ``dataset_counts``.
        RuntimeError: if every dataset quota for the class is already used up.
    """
    counts = dataset_counts[class_name]
    # Without this guard the redraw loop below would spin forever once all
    # quotas for the class reach zero (i.e. on the first surplus image).
    if all(remaining <= 0 for remaining in counts):
        raise RuntimeError(
            "No remaining quota in any dataset for class %r" % (class_name,)
        )

    rand_dataset = weighted_random_choice()
    while counts[rand_dataset] <= 0:
        rand_dataset = weighted_random_choice()
    counts[rand_dataset] -= 1
    return datasets[rand_dataset]


def get_output_path(orig_path):
    """Return the output directory for an image found in ``orig_path``.

    The class is looked up in ``class_map`` by the path relative to
    ``INPUT_DIR``; the dataset is drawn with ``get_random_dataset``, which
    consumes one slot of that class's quota. The result has the form
    ``<OUTPUT_DIR>/<dataset>/<class>``.
    """
    class_name = class_map[remove_prefix(orig_path, INPUT_DIR)]
    dataset_name = get_random_dataset(class_name)
    return "%s/%s/%s" % (OUTPUT_DIR, dataset_name, class_name)


def resize_and_save(image, path):
    """Scale ``image`` to ``target_size`` and write the result to ``path``."""
    image.resize(target_size).save(path)


def process_file(path, filename):
    """Convert one source image into a resized JPEG in its output directory.

    HEIF files are decoded with ``open_heif_image``; everything else goes
    through ``open_image``. The output directory comes from
    ``get_output_path`` (which consumes one dataset quota slot), and the file
    keeps its base name with a ``.jpg`` extension.

    NOTE(review): output names are derived from the base name only, so two
    source files with the same base name that land in the same dataset/class
    directory would overwrite each other. Confirm source names are unique.
    """
    filepath = os.path.join(path, filename)
    raw_filename, ext = os.path.splitext(filename)

    if ImageFormat.from_ext(ext) == ImageFormat.HEIF:
        image = open_heif_image(filepath)
    else:
        image = open_image(filepath)

    # The context manager closes the underlying file once the JPEG is written,
    # so thousands of files do not accumulate open handles.
    with image:
        jpeg_ready = image
        # The JPEG writer only accepts these modes. Anything else (RGBA or
        # palette PNGs, HEIF with alpha) is converted so the save cannot fail.
        if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            jpeg_ready = image.convert("RGB")

        output_filepath = "./%s/%s.jpg" % (get_output_path(path), raw_filename)
        resize_and_save(jpeg_ready, output_filepath)


def process_files(path):
    """Recursively convert every image below ``path``.

    Sub-directories are always descended into. Files are handed to
    ``process_file`` only when ``path`` (relative to ``INPUT_DIR``) is a key
    of ``class_map``. Files in any other directory are skipped, with a single
    "Skipping" message for that directory.

    Unlike an early ``return`` on the first unmapped file, skipping does not
    stop the scan. ``os.scandir`` yields entries in arbitrary order, so a
    stray file must not prevent later sub-directories from being visited.
    """
    is_class_dir = remove_prefix(path, INPUT_DIR) in class_map
    num_files = 0
    skipped_files = False

    # The with-block closes the directory iterator deterministically.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_dir():
                process_files(entry.path)
                continue
            if not is_class_dir:
                if not skipped_files:
                    print("Skipping %s..." % path)
                    skipped_files = True
                continue
            process_file(path, entry.name)
            num_files += 1

    # A directory whose files were skipped reports only the "Skipping" line.
    if not skipped_files:
        print("Processed %d files in %s..." % (num_files, path))


# Walk the whole raw image tree and convert every file that sits in a
# directory listed in class_map.
process_files(INPUT_DIR)

Processed 608 files in tmp/Data Collection (Images)/Empty/Dorcas-final...
Processed 107 files in tmp/Data Collection (Images)/Empty/Dorcas...
Processed 125 files in tmp/Data Collection (Images)/Empty/Steve...
Processed 105 files in tmp/Data Collection (Images)/Empty/An Qi...
Processed 30 files in tmp/Data Collection (Images)/Empty/Lucas...
Processed 0 files in tmp/Data Collection (Images)/Empty...
Processed 561 files in tmp/Data Collection (Images)/Lock + Key/Dorcas-final...
Processed 102 files in tmp/Data Collection (Images)/Lock + Key/Dorcas...
Processed 66 files in tmp/Data Collection (Images)/Lock + Key/Steve...
Processed 104 files in tmp/Data Collection (Images)/Lock + Key/An Qi...
Processed 83 files in tmp/Data Collection (Images)/Lock + Key/Lucas...
Processed 0 files in tmp/Data Collection (Images)/Lock + Key...
Processed 439 files in tmp/Data Collection (Images)/Lock Picking/Dorcas-final...
Processed 60 files in tmp/Data Collection (Images)/Lock Picking/Rectangular...
Processed

In [7]:
# Source directories not covered by the initial class_map, with their labels.
additional_class_dirs = {
    '/Lock + Key/Dorcas-final-2': 'lock-n-key',
    '/Lock Picking/Dorcas-final-2': 'lock-pick',
    '/Empty/Dorcas-final-2': 'empty',
}

# Register all of them first, then convert each directory in the same order.
class_map.update(additional_class_dirs)

for relative_dir in additional_class_dirs:
    process_files(INPUT_DIR + relative_dir)

Processed 84 files in tmp/Data Collection (Images)/Lock + Key/Dorcas-final-2...
Processed 70 files in tmp/Data Collection (Images)/Lock Picking/Dorcas-final-2...
Processed 25 files in tmp/Data Collection (Images)/Empty/Dorcas-final-2...


> Known issue: Some JPG images are, for whatever reason, rotated 90 degrees before being saved.

## Data Augmentation

For this set, we need to generate augmented data to increase the size of the validation and testing sets.

In [36]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img
import numpy as np

# Directories that receive the augmented validation and test images.
VALIDATE_OUTPUT_DIR = 'datasets/validate'
TEST_OUTPUT_DIR = 'datasets/test'

# Number of augmented images to save per class; process_files_for_augmentation
# keeps drawing batches until every class has reached it.
num_images_required = 75
# Number of images the generator yields per batch.
batch_size = 25


def create_generator(input_path):
    """Build a directory iterator yielding augmented image batches.

    Images are read from the class sub-directories of ``input_path``, resized
    to ``target_size`` and returned in batches of ``batch_size`` together with
    one-hot ("categorical") labels.
    """
    # Augmentations applied to every image that is drawn.
    augmentation_options = dict(
        rescale=1./255,
        brightness_range=(0.5, 1.5),
        rotation_range=20,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
    )
    augmenter = ImageDataGenerator(**augmentation_options)

    # Resizing to the required input shape happens while loading.
    return augmenter.flow_from_directory(
        input_path,
        target_size=target_size,
        batch_size=batch_size,
        class_mode='categorical',
    )


def process_files_for_augmentation(path, output_dir):
    """Save ``num_images_required`` augmented images per class.

    Batches are drawn from ``create_generator(path)`` until every class has
    reached its quota. Each image is written to
    ``<output_dir>/<class>/<n>.jpg``, where ``n`` counts up from 0 per class.
    The per-class totals are printed after every batch.

    Raises:
        ValueError: if ``path`` yields no classes, or a class has no source
            images. Such a class could never reach its quota, so the loop
            would otherwise never end.

    NOTE(review): ``array_to_img`` is called with its defaults. If those
    rescale each image to the full 0-255 range, the generator's
    ``rescale=1./255`` has no effect on the saved files; confirm.
    """
    generator = create_generator(path)
    class_indices = generator.class_indices
    # Order names by class index so classes[i] matches one-hot column i.
    classes = sorted(class_indices, key=class_indices.get)

    source_counts = np.bincount(generator.classes, minlength=len(classes))
    empty_classes = [
        name for name, found in zip(classes, source_counts) if found == 0
    ]
    if not classes or empty_classes:
        raise ValueError(
            "Cannot augment %s: no source images for classes %s"
            % (path, empty_classes)
        )

    # One counter per discovered class instead of a fixed three.
    counts = [0] * len(classes)
    # Generate batches of images to save to the given directory.
    while min(counts) < num_images_required:
        data, labels = next(generator)

        for image_data, label in zip(data, labels):
            class_id = int(np.argmax(label))

            # This class is already complete; only the others still need images.
            if counts[class_id] >= num_images_required:
                continue

            image = array_to_img(image_data)

            save_path = "%s/%s/%d.jpg" % (output_dir, classes[class_id], counts[class_id])
            image.save(save_path)

            counts[class_id] += 1

        print(counts)

    print("Processed %s." % path)


# Augment the validation split first, then the test split. Each split reads
# from its folder under INPUT_DIR and writes to its output directory.
for split_name, split_output_dir in (
    ("validate", VALIDATE_OUTPUT_DIR),
    ("test", TEST_OUTPUT_DIR),
):
    process_files_for_augmentation(INPUT_DIR + "/" + split_name, split_output_dir)

Found 153 images belonging to 3 classes.
[7, 9, 9]
[19, 16, 15]
[31, 24, 20]
[43, 28, 29]
[54, 34, 37]
[67, 38, 45]
[69, 39, 45]
[75, 44, 52]
[75, 48, 61]
[75, 56, 70]
[75, 62, 75]
[75, 70, 75]
[75, 75, 75]
Processed tmp/Data Collection (Images)/validate.
Found 118 images belonging to 3 classes.
[13, 5, 7]
[28, 10, 12]
[39, 15, 21]
[47, 22, 31]
[56, 27, 35]
[72, 30, 41]
[75, 36, 52]
[75, 43, 57]
[75, 51, 63]
[75, 54, 70]
[75, 61, 75]
[75, 66, 75]
[75, 70, 75]
[75, 75, 75]
Processed tmp/Data Collection (Images)/test.
