### Import libraries

In [67]:
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow import keras
import shutil
import matplotlib.image as mpimg
import glob
import os

## Prepare the data

In [68]:
# Model / data parameters
num_classes = 10   # number of target classes
img_width = 28     # image width in pixels
img_height = 28    # image height in pixels
# Single-channel images: (height, width, channels)
input_shape = (img_height, img_width, 1)
batch_size = 16
epochs = 50

`notMNIST_load_data` is a function that loads the notMNIST dataset. It returns two tuples containing the training and test data (no validation split is produced yet). Each tuple is formed by a NumPy array containing the images and a NumPy array containing the labels (0 to 9).

In [69]:
def notMNIST_load_data(data_dir_letters='data/notMNIST_small'):
    """Load the notMNIST images as class-balanced training and test sets.

    Every sub-directory of ``data_dir_letters`` is treated as one class and
    must be named with a single capital letter; the label is the letter's
    offset from 'A' (A -> 0, B -> 1, ...). The same number of images is taken
    from every class, based on the size of the smallest class: the first 80%
    go to the training set and the remaining 20% to the test set. No
    validation split is produced.

    Folders and files are visited in sorted order so the split is the same on
    every run.

    Args:
        data_dir_letters: Directory holding one sub-directory of ``*.png``
            images per class.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where each ``x`` is a NumPy
        array of the images as read by ``matplotlib.image.imread`` and each
        ``y`` is a NumPy array of integer labels.

    Raises:
        ValueError: If the number of class folders differs from the global
            ``num_classes``, or if a folder name does not map to a label in
            ``range(num_classes)``.
    """
    # Keep only directories: stray entries such as hidden files are not classes.
    # os.listdir() order is arbitrary, so sort to make the split reproducible.
    letters = sorted(
        entry for entry in os.listdir(data_dir_letters)
        if os.path.isdir(os.path.join(data_dir_letters, entry))
    )
    if len(letters) != num_classes:
        raise ValueError('The number of classes is not equal to the number of letters in the folder')

    labels = {}
    image_paths = {}
    for letter in letters:
        # A label outside range(num_classes) would break one-hot encoding later on.
        label = ord(letter) - ord('A') if len(letter) == 1 else -1
        if not 0 <= label < num_classes:
            raise ValueError(
                f'Cannot map folder {letter!r} to a label in [0, {num_classes})'
            )
        labels[letter] = label
        # Count exactly the files that will be loaded (PNG images), in a stable order.
        image_paths[letter] = sorted(
            glob.glob(os.path.join(data_dir_letters, letter, "*.png"))
        )

    # Size every class like the smallest one so the classes stay balanced.
    min_nb_samples = min((len(paths) for paths in image_paths.values()), default=0)

    # 80% of the data is used for training, the remaining 20% for testing.
    # The test size is derived from the smallest class, not from whichever
    # folder happened to be listed last.
    nb_train_samples = int(min_nb_samples * 0.8)
    nb_test_samples = min_nb_samples - nb_train_samples

    x_train, y_train = [], []
    x_test, y_test = [], []

    # TODO : randomize the order of the images would be better ?
    for letter in letters:
        label = labels[letter]
        paths = image_paths[letter]

        for path in paths[:nb_train_samples]:
            x_train.append(mpimg.imread(path))
            y_train.append(label)

        for path in paths[nb_train_samples:nb_train_samples + nb_test_samples]:
            x_test.append(mpimg.imread(path))
            y_test.append(label)

    return (np.array(x_train), np.array(y_train)), (np.array(x_test), np.array(y_test))

In [70]:
# Read the dataset once, then unpack the train and test splits
train_split, test_split = notMNIST_load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [71]:
# Scale images to the [0, 1] range
# matplotlib's imread already returns PNG pixels as floats in [0, 1]; dividing
# those by 255 again would squash them into [0, 1/255]. Only rescale data that
# is still on the 0-255 scale, which also makes this cell safe to re-run.
needs_rescale = any(
    images.size and images.max() > 1.0 for images in (x_train, x_test)
)
pixel_scale = 255.0 if needs_rescale else 1.0
x_train = x_train.astype("float32") / pixel_scale
x_test = x_test.astype("float32") / pixel_scale

In [72]:
# Append a trailing channel axis so images have shape (28, 28, 1)
x_train = x_train[..., np.newaxis]
x_test = x_test[..., np.newaxis]
print("x_train shape:", x_train.shape)
print(len(x_train), "train samples")
print(len(x_test), "test samples")

x_train shape: (14970, 28, 28, 1)
14970 train samples
3750 test samples


In [73]:
# One-hot encode the integer class labels (one column per class)
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes=num_classes)
    for labels in (y_train, y_test)
)

## Build the model