In [5]:
import tensorflow as tf
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
import pathlib
from datetime import datetime
#import IPython.display as display
#import pandas as pd

In [11]:
# a fancy way of setting AUTOTUNE to -1 so the maximum number of threads are run later
AUTOTUNE = tf.data.experimental.AUTOTUNE
FOLDER_NAME = "natural_images"
# set data_dir to be the path to the selected folder
data_dir = pathlib.Path(str(FOLDER_NAME))
# counts all JPG files in all subfolders of our selected folder
image_count = len(list(data_dir.glob('*/*.jpg')))
print(str(image_count) + " images found.")
# makes a list of the subfolder names in our selected folder, which corresponds to the classes 
CLASS_NAMES = np.array([item.name for item in data_dir.glob('*') if item.name != "LICENSE.txt"])
print("Existing classes: " + str(CLASS_NAMES))
#
BATCH_SIZE = 1000
# pick a size to which the images should be rescaled
IMG_HEIGHT = 128
IMG_WIDTH = 128
# BUFFER_SIZE will be used for shuffling the dataset later on. 
# By selecting image_count as the size, the complete data set is guaranteed to be shuffled.
BUFFER_SIZE = image_count
# Sets a split size for train and test data set
TRAIN_SIZE = int(image_count * 0.7)
# choose whether the labels shoud be one hot encoded. This appears to be beneficial if labels aren't numeric.
ONE_HOT = True
# produces a time stamp to use for file naming
TIME_STAMP = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

6899 images found.
Existing classes: ['7' '5' '0' '2' '4' '3' '1' '6']
2020-01-28_11-48-41
2020-01-28_11-48-41_cache.txt


In [8]:
# shuffles the dataset an splits it into train and test test
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*')).shuffle(BUFFER_SIZE)
train_list_ds = list_ds.take(TRAIN_SIZE)
test_list_ds = list_ds.skip(TRAIN_SIZE)

def get_label(file_path):
    # convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The second to last is the class-directory
    if ONE_HOT == True:
        # return the class (second to last element in the path) in one hot encoding
        return parts[-2] == CLASS_NAMES
    else:
        # return the class (second to last element in the path) as an integer (if the folder is named as an integer)
        return int(parts[-2])
def decode_img(img):
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # Use `convert_image_dtype` to convert to floats in the [0,1] range.
    img = tf.image.convert_image_dtype(img, tf.float32)
    # resize the image to the desired size.
    return tf.image.resize(img, [IMG_WIDTH, IMG_HEIGHT])
def process_path(file_path):
    label = get_label(file_path)
    # load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img, label

# Set `num_parallel_calls` so multiple images are loaded/processed in parallel by different cores. 
# -1 uses all cores.
train_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
test_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
    
def prepare_for_training(ds, cache=True, shuffle_buffer_size=BUFFER_SIZE):
    # This is a small dataset, only load it once, and keep it in memory.
    # use `.cache(filename)` to cache preprocessing work for datasets that don't
    # fit in memory.
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
    else:
        # untested! If dataset is to large for memory, a time stamped cache file is produced to take data from
        # if the code is run again with the same time stamp, the file will be reused. For new time stamps
        # a new file will be generated.
        ds = ds.cache(str(TIME_STAMP) + "_cache.txt")
        
    # shuffles the dataset again
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    # Repeat forever
    ds = ds.repeat()
    ds = ds.batch(BATCH_SIZE)

    # `prefetch` lets the dataset fetch batches in the background while the model
    # is training.
    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

In [72]:
train_ds = prepare_for_training(train_labeled_ds)
test_ds = prepare_for_training(test_labeled_ds)

# obtain the image and label batches for both test and training
train_image_batch, train_label_batch = next(iter(train_ds))
test_image_batch, test_label_batch = next(iter(test_ds))

In [80]:
# build a sequential model
model = tf.keras.Sequential([
    # use first layer to flatten the image and take all inputs
    tf.keras.layers.Flatten(input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    # add an arbitrary number of dense layers with an arbitrary number of nodes
    tf.keras.layers.Dense(128, activation='relu'),
    # add an output layer with as many nodes as existing lables
    tf.keras.layers.Dense(8, activation='softmax')
])
#OPTIMIZER = "RMSprop"
OPTIMIZER = "Adam"
#OPTIMIZER = "SGD"
if ONE_HOT == True:
    if OPTIMIZER == "RMSprop":
        model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss=tf.keras.losses.CategoricalCrossentropy(),
                    metrics=[tf.keras.metrics.CategoricalAccuracy()])
    elif OPTIMIZER == "Adam":
        model.compile(optimizer=tf.keras.optimizers.Adam(),
                    loss=tf.keras.losses.CategoricalCrossentropy(),
                    metrics=[tf.keras.metrics.CategoricalAccuracy()])
    elif OPTIMIZER == "SGD":
        model.compile(optimizer=tf.keras.optimizers.SGD(),
                    loss=tf.keras.losses.CategoricalCrossentropy(),
                    metrics=[tf.keras.metrics.CategoricalAccuracy()])        
else:
    if OPTIMIZER == "RMSprop":
        model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                    metrics=[tf.keras.metrics.SpareCategoricalAccuracy()])
    elif OPTIMIZER == "Adam":
        model.compile(optimizer=tf.keras.optimizers.Adam(),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    elif OPTIMIZER == "SGD":
        model.compile(optimizer=tf.keras.optimizers.SGD(),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

model.fit(train_image_batch, train_label_batch, epochs=10)
print()
test_loss, test_acc = model.evaluate(test_image_batch,  test_label_batch, verbose=2)
print('\nTest accuracy:', test_acc)



Train on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

1000/1000 - 1s - loss: 1.2975 - categorical_accuracy: 0.6190

Test accuracy: 0.619
