In [1]:
# dataset: https://github.com/MartinThoma/HASY
# 369 categories
# 168,233 images, some categories have more images than others

In [2]:
from IPython.display import Image

In [3]:
# Example from the dataset: display one sample image inline.
# `Image` here is IPython.display.Image, not the PIL class (PIL's is imported as `pil_image`).

Image(url="HASYv2/hasy-data/v2-00010.png")

In [4]:
import csv
from PIL import Image as pil_image
import keras.preprocessing.image

In [6]:
# Load all images (as numpy arrays) and save their classes.
#
# From each CSV row only two columns are used: row[0] is the image path
# (relative to the HASYv2/ folder) and row[2] is the class name.

imgs = []     # one (relative path, class name, pixel array) tuple per image
classes = []  # class name per image, in the same order as `imgs`
with open('HASYv2/hasy-data-labels.csv', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    # the first row is the column header, not an image; the None default keeps
    # an empty file from raising StopIteration
    next(csvreader, None)
    for row in csvreader:
        # open inside a context manager so the file handle is released as soon
        # as the pixels have been copied into the array
        with pil_image.open("HASYv2/" + row[0]) as image_file:
            img = keras.preprocessing.image.img_to_array(image_file)
        # neuron activation functions behave best when input values are between 0.0 and 1.0 (-1.0 and 1.0),
        # so we rescale each pixel value to be in the range 0.0 to 1.0 instead of 0-255
        img /= 255.0
        imgs.append((row[0], row[2], img))
        classes.append(row[2])

In [7]:
# peek at the first record: (relative path, class name, pixel array scaled to 0.0-1.0)
imgs[0]

('hasy-data/v2-00000.png', 'A', array([[[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],
 
        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],
 
        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],
 
        ...,
 
        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],
 
        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],
 
        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]], dtype=float32))

In [8]:
# number of image records that were loaded
len(imgs)

168233

In [13]:
# shuffle the data, split into 80% train, 20% test

import random

SPLIT_SEED = 42        # fixed seed so the split is identical on every run
TRAIN_FRACTION = 0.8   # share of the records used for training; the rest is the test set

# Shuffle a copy with a dedicated, seeded generator: `imgs` keeps its original
# order, so re-running this cell reproduces the same split instead of
# reshuffling an already shuffled list.
shuffled_imgs = list(imgs)
random.Random(SPLIT_SEED).shuffle(shuffled_imgs)

split_idx = int(TRAIN_FRACTION * len(shuffled_imgs))
train = shuffled_imgs[:split_idx]
test = shuffled_imgs[split_idx:]

In [14]:
import numpy as np

# Split every (path, class name, pixels) record into model inputs and targets:
# the pixel arrays become the inputs, the class names the (still textual) outputs.
train_input = np.asarray([pixels for _, _, pixels in train])
test_input = np.asarray([pixels for _, _, pixels in test])

train_output = np.asarray([label for _, label, _ in train])
test_output = np.asarray([label for _, label, _ in test])

In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [16]:
# convert class names into one-hot encoding

# first, convert class names into integers; the encoder is fitted on the labels
# of the whole dataset so train and test share one name -> index mapping
label_encoder = LabelEncoder()
label_encoder.fit(classes)

num_classes = len(label_encoder.classes_)

# Take the class names from the (path, class name, pixels) records instead of
# from `train_output`/`test_output`: those two names are overwritten below, so
# reading them here would make the cell fail when it is run a second time.
train_output_int = label_encoder.transform([record[1] for record in train])
test_output_int = label_encoder.transform([record[1] for record in test])

# then convert integers into one-hot encoding: row k of the identity matrix is
# the one-hot vector of class k, so indexing with the integer labels yields one
# row per sample. Built with numpy instead of OneHotEncoder(sparse=False): the
# result is the same float64 matrix, but it does not rely on the `sparse`
# keyword, which newer scikit-learn releases no longer accept.
one_hot_rows = np.eye(num_classes)
train_output = one_hot_rows[train_output_int]
test_output = one_hot_rows[test_output_int]

print("Number of classes: %d" % num_classes)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Number of classes: 369


In [17]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

In [19]:
# Baseline network: two Conv2D + MaxPooling2D stages, flattened into a tanh
# Dense layer with dropout, followed by a softmax over all classes.
model = Sequential()
# only the first layer declares the input shape (the shape of a single image)
model.add(Conv2D(32, kernel_size=(3,3), activation='relu',
                 input_shape=np.shape(train_input[0])))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Conv2D(32, (3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
model.add(Dense(1024, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# categorical cross-entropy matches the one-hot encoded targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 30, 30, 32)        896       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 15, 15, 32)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 13, 13, 32)        9248      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 6, 6, 32)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1180672   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
__________

In [21]:
import keras.callbacks
# TensorBoard callback for the baseline run; ./logs/mnist-style is passed as its log directory
tensorboard = keras.callbacks.TensorBoard(log_dir='./logs/mnist-style')

  return f(*args, **kwds)


In [22]:
# Train the baseline model: 10 epochs, mini-batches of 32 samples, one log line
# per epoch (verbose=2). 20% of the training arrays is held out for validation
# and the TensorBoard callback is attached to the run.
model.fit(
    train_input,
    train_output,
    batch_size=32,
    epochs=10,
    verbose=2,
    validation_split=0.2,
    callbacks=[tensorboard],
)

Instructions for updating:
Use tf.cast instead.
Train on 107668 samples, validate on 26918 samples
Epoch 1/10
 - 263s - loss: 1.5272 - acc: 0.6313 - val_loss: 0.9558 - val_acc: 0.7352
Epoch 2/10
 - 266s - loss: 0.9506 - acc: 0.7358 - val_loss: 0.8662 - val_acc: 0.7622
Epoch 3/10
 - 273s - loss: 0.8367 - acc: 0.7585 - val_loss: 0.8145 - val_acc: 0.7673
Epoch 4/10
 - 273s - loss: 0.7645 - acc: 0.7749 - val_loss: 0.8269 - val_acc: 0.7697
Epoch 5/10
 - 260s - loss: 0.7132 - acc: 0.7843 - val_loss: 0.8390 - val_acc: 0.7676
Epoch 6/10
 - 263s - loss: 0.6737 - acc: 0.7928 - val_loss: 0.8274 - val_acc: 0.7754
Epoch 7/10
 - 259s - loss: 0.6400 - acc: 0.8002 - val_loss: 0.8492 - val_acc: 0.7684
Epoch 8/10
 - 266s - loss: 0.6133 - acc: 0.8067 - val_loss: 0.8717 - val_acc: 0.7567
Epoch 9/10
 - 263s - loss: 0.5915 - acc: 0.8119 - val_loss: 0.8570 - val_acc: 0.7760
Epoch 10/10
 - 264s - loss: 0.5687 - acc: 0.8171 - val_loss: 0.8478 - val_acc: 0.7697


<keras.callbacks.History at 0x1288f19b0>

In [23]:
# Measure the trained model on the held-out test split. The model was compiled
# with a single metric, so evaluate() yields the loss followed by the accuracy.
score = model.evaluate(test_input, test_output, verbose=2)
test_loss, test_accuracy = score
print('Test loss: ', test_loss)
print('Test accuracy: ', test_accuracy)

Test loss:  0.8572115720507046
Test accuracy:  0.7618212619341405


In [None]:
# try various model configurations and parameters to find the best

import itertools
import time


def _build_candidate_model(conv2d_count, dense_size, dropout):
    """Build and compile one CNN candidate for the hyper-parameter search.

    Args:
        conv2d_count: number of Conv2D(32, 3x3, relu) + MaxPooling2D(2x2) stages.
        dense_size: number of units in the hidden tanh Dense layer.
        dropout: dropout rate applied after the hidden Dense layer; no Dropout
            layer is added unless the rate is greater than 0.0.

    Returns:
        A compiled Sequential model ending in a softmax over `num_classes`.
    """
    candidate = Sequential()
    for stage in range(conv2d_count):
        if stage == 0:
            # only the first layer declares the input shape (one image)
            candidate.add(Conv2D(32, kernel_size=(3,3), activation='relu',
                                 input_shape=np.shape(train_input[0])))
        else:
            candidate.add(Conv2D(32, kernel_size=(3,3), activation='relu'))
        candidate.add(MaxPooling2D(pool_size=(2,2)))
    candidate.add(Flatten())
    candidate.add(Dense(dense_size, activation='tanh'))
    if dropout > 0.0:
        candidate.add(Dropout(dropout))
    candidate.add(Dense(num_classes, activation='softmax'))

    candidate.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return candidate


# NOTE(review): every candidate is scored on the test split, so the winning
# configuration is selected with test data - consider ranking on the validation
# metrics instead and keeping the test split for the final model only.
results = []  # (conv2d_count, dense_size, dropout, test loss, test accuracy, seconds)
# product() walks the grid in the same order as three nested loops would:
# conv2d_count outermost, dropout innermost
for conv2d_count, dense_size, dropout in itertools.product(
        [1, 2],
        [128, 256, 512, 1024, 2048],
        [0.0, 0.25, 0.50, 0.75]):
    model = _build_candidate_model(conv2d_count, dense_size, dropout)

    # one TensorBoard log directory per configuration so the runs can be compared
    log_dir = './logs/conv2d_%d-dense_%d-dropout_%.2f' % (conv2d_count, dense_size, dropout)
    tensorboard = keras.callbacks.TensorBoard(log_dir=log_dir)

    # the measured time covers both training and the test-set evaluation
    start = time.time()
    model.fit(train_input, train_output, batch_size=32, epochs=10,
              verbose=0, validation_split=0.2, callbacks=[tensorboard])
    score = model.evaluate(test_input, test_output, verbose=2)
    end = time.time()
    elapsed = end - start
    print("Conv2D count: %d, Dense size: %d, Dropout: %.2f - Loss: %.2f, Accuracy: %.2f, Time: %d sec" %
          (conv2d_count, dense_size, dropout, score[0], score[1], elapsed))
    results.append((conv2d_count, dense_size, dropout, score[0], score[1], elapsed))

In [None]:
# rebuild/retrain a model with the best parameters (from the search) and use all data
# NOTE(review): Dense(128) and Dropout(0.5) are hard-coded here; presumably they
# are the winners of the search above - confirm against `results`.
model = Sequential()
model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=np.shape(train_input[0])))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Conv2D(32, (3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
model.summary()
# join train and test data so we train the network on all data we have available to us;
# no validation split and no evaluation follow, because no held-out data is left
model.fit(np.concatenate((train_input, test_input)),
          np.concatenate((train_output, test_output)),
          batch_size=32, epochs=10, verbose=2)

# save the trained model
model.save("mathsymbols.model")

# save the label encoder's class names (needed to reverse the one-hot encoding)
np.save("classes.npy", label_encoder.classes_)

In [None]:
# Load the pre-trained model and predict the math symbol for an arbitrary image;
# the code below could be placed in a separate file

import keras.models
model2 = keras.models.load_model("mathsymbols.model")
# summary() prints the table itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
model2.summary()

# restore the class name to integer encoder: a LabelEncoder only needs its
# `classes_` array to translate indices back into class names
label_encoder2 = LabelEncoder()
label_encoder2.classes_ = np.load("classes.npy")

def predict(img_path, target_size=(32, 32)):
    """Print the predicted math symbol (and its score) for one image file.

    The image is prepared like the training data: converted to a float array
    and rescaled from 0-255 to 0.0-1.0. It is additionally converted to RGB and,
    if necessary, resized, so that files which are not already 32x32 RGB can be
    classified instead of failing on a shape mismatch.

    Args:
        img_path: path of the image file to classify.
        target_size: (width, height) expected by the model; images of a
            different size are resized to it. Defaults to (32, 32).

    Returns:
        None; the prediction is printed.
    """
    wanted_size = tuple(target_size)
    # the context manager releases the file handle once the pixels are copied out
    with pil_image.open(img_path) as image_file:
        # the model input has three channels; convert() leaves the pixel values
        # of an image that is already RGB unchanged
        rgb_image = image_file.convert("RGB")
        if rgb_image.size != wanted_size:
            rgb_image = rgb_image.resize(wanted_size)
        newimg = keras.preprocessing.image.img_to_array(rgb_image)
    newimg /= 255.0

    # do the prediction; the model works on batches, so add a leading batch axis
    prediction = model2.predict(np.expand_dims(newimg, axis=0))

    # figure out which output neuron had the highest score, and reverse the one-hot encoding
    inverted = label_encoder2.inverse_transform([np.argmax(prediction)]) # argmax finds highest-scoring output
    print("Prediction: %s, confidence: %.2f" % (inverted[0], np.max(prediction)))

In [None]:
# grab an image (we'll just use a fixed image from the dataset folder for demonstration purposes)
predict("HASYv2/hasy-data/v2-00010.png")

In [None]:
# classify a second image from the dataset folder
predict("HASYv2/hasy-data/v2-00500.png")

In [None]:
# classify a third image from the dataset folder
predict("HASYv2/hasy-data/v2-00070.png")