In [8]:
from tensorflow.keras.layers import StringLookup
from tensorflow import keras

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import re

# 1. Load training data, sort by class

In [9]:
# 1. Load the training data, randomly arrange it
directory = "./training_images/sorted/"
files = []
for filename in os.listdir(directory):
    if (filename != '.gitignore'):
        files.append(directory+filename)

# Gather the class labels
class_labels = []
for filepath in files:
    filepath = filepath.split('/')[3][:-4]
    filepath = re.sub(pattern=r"[^a-zA-Z]", repl=r"", string=filepath)
    if (filepath not in class_labels):
        class_labels.append(filepath)

np.random.shuffle(files)

# 2. Segment data equally among classes

In [10]:

# 2. Split the training data into three subsets 90:5:5 (training:validation:test)
train_samples = []
test_samples = []
validation_samples = []

# gets 90:5:5 of each class
for class_label in class_labels:
    class_list = [filepath for filepath in files if class_label in filepath]

    split_idx = int(0.9 * len(class_list))
    train_samples += class_list[:split_idx]
    test_or_val_samples = class_list[split_idx:]

    val_split_idx = int(0.5 * len(test_or_val_samples))
    validation_samples += test_or_val_samples[:val_split_idx]
    test_samples += test_or_val_samples[val_split_idx:]
    

assert len(files) == len(train_samples) + len(validation_samples) + len(
    test_samples
)

print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

# a function that gets the image paths and their corresponding labels for whatever array we put in
def get_labels(paths):
    labels = []
    for filepath in paths:
        filepath = filepath.split('/')[3][:-4]
        filepath = re.sub(pattern=r"[^a-zA-Z]", repl=r"", string=filepath)
        labels.append(filepath)
    return paths, labels

train_img_paths, train_labels = get_labels(train_samples)
validation_img_paths, validation_labels = get_labels(validation_samples)
test_img_paths, test_labels = get_labels(test_samples)

Total training samples: 74
Total validation samples: 3
Total test samples: 8


# 3. Prepare the dataset

In [None]:
ds_

In [59]:
batch_size = 64
padding_token = 99

AUTOTUNE = tf.data.AUTOTUNE

def preprocess_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_png(image)
    return image

def process_images_labels(image_path, label):
    image = preprocess_image(image_path)
    return {"image": image, "label": label}

def prepare_dataset(image_paths, labels):
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)).map(
        process_images_labels, num_parallel_calls=AUTOTUNE
    )
    return dataset.batch(batch_size).cache().prefetch(AUTOTUNE)

train_ds = process_images_labels(train_img_paths, train_labels)
validation_ds = process_images_labels(validation_img_paths, validation_labels)
test_ds = process_images_labels(test_img_paths, test_labels)

InvalidArgumentError: {{function_node __wrapped__ReadFile_device_/job:localhost/replica:0/task:0/device:CPU:0}} Input filename tensor must be scalar, but had shape: [74] [Op:ReadFile]

In [58]:
for data in train_ds.take(1):
    images, labels = data.element_spec["image"], data.element_spec["label"]

    _, ax = plt.subplots(4, 4, figsize=(15, 8))

    for i in range(16):
        img = images[i]
        label = labels[i]

        ax[i // 4, i % 4].imshow(img)
        ax[i // 4, i % 4].set_title(label)
        ax[i // 4, i % 4].axis("off")

plt.show()
