In [4]:
import os
import urllib
import gzip
import csv
import numpy as np
import tensorflow as tf

In [5]:
URL_PATH = 'http://ai.stanford.edu/~btaskar/ocr/letter.data.gz'
DOWNLOAD_FILENAME = 'letter.data.gz'

In [6]:
def download_data():
    if not os.path.exists(DOWNLOAD_FILENAME):
        filename, _ = urllib.request.urlretrieve(URL_PATH, DOWNLOAD_FILENAME)
    print('Found and verified file from this path: ',URL_PATH)
    print('Download File: ', DOWNLOAD_FILENAME)


In [7]:
download_data()

Found and verified file from this path:  http://ai.stanford.edu/~btaskar/ocr/letter.data.gz
Download File:  letter.data.gz


In [8]:
def read_lines():
    with gzip.open(DOWNLOAD_FILENAME, 'rt') as f:
        reader = csv.reader(f, delimiter='\t')
        lines = list(reader)
        return lines

In [9]:
lines = read_lines()


In [10]:
len(lines)

52152

In [11]:
lines[1][:15]

['2', 'm', '3', '1', '2', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

In [12]:
def get_features_labels(lines):
    
    lines = sorted(lines, key=lambda x: int(x[0]))
    
    data, target = [], []
    #-1 indicate that beginig a new word
    next_id = -1
    word = []
    word_pixels = []
    
    for line in lines:
        next_id = int(line[2])
        
        pixels = np.array([int(x) for x in line[6:134]])
        pixels = pixels.reshape((16,8))
        
        word_pixels.append(pixels)
        word.append(line[1])
        
        if next_id == -1:
            data.append(word_pixels)
            target.append(word)
            
            word = []
            word_pixels = []
            
    return data, target

In [13]:
data, target = get_features_labels(lines)

In [14]:
len(data), len(target)

(6877, 6877)

In [15]:
def pad_features_labels(data, target):
    max_length = max(len(x) for x in target)
    
    padding = np.zeros((16,8))
    
    #Pad the image data with the empty string images
    data = [x + ([padding] * (max_length - len(x))) for x in data]
    
    #Pad the word with empty string characters
    target = [x + ([''] * (max_length - len(x))) for x in target]
    
    return np.array(data), np.array(target)

In [16]:
padded_data, padded_target = pad_features_labels(data,target)

In [17]:
len(padded_data), len(padded_target)

(6877, 6877)

In [18]:
padded_target[:10]

array([['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', '']],
      dtype='<U1')

In [19]:
padded_target[200:210]

array([['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', '']],
      dtype='<U1')

In [20]:
word_length = len(padded_target[0])

In [21]:
word_length

14

In [22]:
padded_data.shape[:2] + (-1,)

(6877, 14, -1)

In [23]:
reshaped_data = padded_data.reshape(padded_data.shape[:2] + (-1,))

In [24]:
 reshaped_data.shape

(6877, 14, 128)

In [25]:
padded_target.shape + (26,)

(6877, 14, 26)

In [26]:
one_hot_target = np.zeros(padded_target.shape + (26,))

In [27]:
for index, letter in np.ndenumerate(padded_target):
    if letter:
        one_hot_target[index][ord(letter) - ord('a')] = 1
        

In [28]:
one_hot_target[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [29]:
shuffled_indices = np.random.permutation(len(reshaped_data))
shuffled_data = reshaped_data[shuffled_indices]
shuffled_target = one_hot_target[shuffled_indices]

In [30]:
#train and test set
split = int(0.66 * len(shuffled_data))
train_data = shuffled_data[:split]
train_target = shuffled_target[:split]

test_data = shuffled_data[split:]
test_target = shuffled_target[split:]


In [31]:
#14 because each word is rapresent with a vector for length 14
train_data.shape 

(4538, 14, 128)

In [32]:
# 26 are the all letters of alphabet considerated (RAPRESENT THE CLASSES)
train_target.shape

(4538, 14, 26)

In [33]:
_, num_steps, num_inputs = train_data.shape

In [34]:
num_classes = train_target.shape[2]

In [35]:
num_steps, num_inputs, num_classes

(14, 128, 26)

In [36]:
tf.reset_default_graph()

In [37]:
X = tf.placeholder(tf.float64,[None, num_steps, num_inputs])

In [38]:
Y = tf.placeholder(tf.float64,[None, num_steps, num_classes])

In [39]:
used = tf.sign(tf.reduce_max(tf.abs(X),reduction_indices=2))
length = tf.reduce_sum(used, reduction_indices=1)
sequence_length = tf.cast(length, tf.int32)

In [40]:
sequence_length

<tf.Tensor 'Cast:0' shape=(?,) dtype=int32>

In [41]:
num_neurons = 300

In [42]:
cell = tf.nn.rnn_cell.GRUCell(num_neurons)

In [43]:
output, _ = tf.nn.dynamic_rnn(cell, X, dtype=tf.float64, sequence_length=sequence_length)

In [44]:
output.shape

TensorShape([Dimension(None), Dimension(14), Dimension(300)])

In [45]:
# share softmax layer
weight = tf.Variable(tf.truncated_normal([num_neurons, num_classes], stddev=0.01, dtype=tf.float64))

In [46]:
bias = tf.Variable(tf.constant(0.1, shape=[num_classes], dtype=tf.float64))
flattened_output = tf.reshape(output, [-1, num_neurons])

In [47]:
flattened_output

<tf.Tensor 'Reshape:0' shape=(?, 300) dtype=float64>

In [48]:
logits = tf.matmul(flattened_output, weight) + bias

In [49]:
logit_reshaped = tf.reshape(logits,[-1, num_steps, num_classes])

In [50]:
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y)

In [51]:
loss = tf.reduce_mean(cross_entropy)

In [52]:
mistakes = tf.not_equal(tf.argmax(Y,2), tf.argmax(logit_reshaped,2))
mistakes = tf.cast(mistakes, tf.float64)

In [53]:
mask = tf.sign(tf.reduce_max(tf.abs(Y), reduction_indices=2))

In [54]:
mistakes *= mask 

In [55]:
mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
mistakes /= tf.cast(sequence_length, tf.float64)

In [56]:
error = tf.reduce_mean(mistakes)

In [57]:
optimizer = tf.train.RMSPropOptimizer(0.002)
gradient = optimizer.compute_gradients(loss)

optimizer = optimizer.apply_gradients(gradient)

In [58]:
def batched(data, target, batch_size):
    
    epoch = 0
    offset = 0
    
    while True:
        old_offset = offset
        offset = (offset + batch_size) % (target.shape[0] - batch_size)
        
        #offset wrapper around to the beginning so new epoch
        if offset < old_offset:
            
            #new epoch, ned to shuffle data
            shuffled_indices = np.random.permutation(len(data))
            
            data = data[shuffled_indices]
            target = target[shuffled_indices]
            
            epoch += 1
        batch_data = data[offset:(offset + batch_size),:]
        batch_target = target[offset:(offset + batch_size),:]
        
        yield batch_data, batch_target, epoch
                

In [59]:
batch_size = 20
batches = batched(train_data, train_target, batch_size)

In [60]:
epochs = 5

In [62]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    
    for index, batch in enumerate(batches):
        batch_data = batch[0]
        batch_target = batch[1]
        
        epoch = batch[2]
        if epoch >= epochs:
            break
        
        feed = (X: batch_data, Y: batch_target)
        train_error, _ = sess.run([error, optimizer], feed)
        
        print('(): {:3.6f}%', format(index + 1, 100*train_error))
    
    test_feed = {X: test_data, Y: test_target}
    test_error, _ = sess.run([error, optimizer], test_feed)
    
    print('Test error: {:3.6f}%', format(100 * test_error))

SyntaxError: invalid syntax (<ipython-input-62-2610eb40e6dc>, line 13)