<a href="https://colab.research.google.com/github/MonicaSai7/OCR-using-CRNN/blob/master/CRNN_HDF5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


# Libraries

In [2]:
import os
import string
import cv2
import h5py
import fnmatch
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, BatchNormalization, Lambda, Bidirectional, LSTM, Dense

# Data Acquisition

In [3]:
with h5py.File('gdrive/My Drive/mjsynth_150000.hdf5', 'r') as f:
  print(list(f.keys()))
  training_img = f.get('train_img').value
  orig_txt = f.get('train_labels').value
  valid_img = f.get('valid_img').value
  valid_orig_txt = f.get('valid_labels').value

['train_img', 'train_labels', 'valid_img', 'valid_labels']


In [4]:
orig_txt[9]

'REWEAVES'

In [5]:
char_list = string.ascii_letters + string.digits
print(char_list)
print(len(char_list))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789
62


In [0]:
def encode_to_labels(text):
    code = []
    for index, char in enumerate(text):
        code.append(char_list.index(char))
    return code

In [0]:
length_checker = np.vectorize(len) 
max_train_label_len = max(length_checker(orig_txt))
max_valid_label_len = max(length_checker(valid_orig_txt))
max_label_len = max(max_train_label_len, max_valid_label_len)

Label length is the length of each output text label and input length is the same for each input to the LSTM layer which is 31 in our architecture.

In [0]:
train_label_length = []
train_input_length = []
train_txt = []

for i in range(len(orig_txt)):
    train_label_length.append(len(orig_txt[i]))
    train_input_length.append(31)
    train_txt.append(encode_to_labels(orig_txt[i]))

In [0]:
valid_label_length = []
valid_input_length = []
valid_txt = []

for i in range(len(valid_orig_txt)):
    valid_label_length.append(len(orig_txt[i]))
    valid_input_length.append(31)
    valid_txt.append(encode_to_labels(valid_orig_txt[i]))

In [0]:
train_padded_txt = pad_sequences(train_txt, maxlen=max_label_len, padding='post', value = len(char_list))
valid_padded_txt = pad_sequences(valid_txt, maxlen=max_label_len, padding='post', value = len(char_list))

# Model Architecture

### Model = CNN + RNN + CTC loss

In [11]:
# input with shape of height=32 and width=128 
inputs = Input(shape=(32,128,1))
 
# convolution layer with kernel size (3,3)
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
# poolig layer with kernel size (2,2)
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)
 
conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)
 
conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
 
conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
# poolig layer with kernel size (2,1)
pool_4 = MaxPool2D(pool_size=(2, 1))(conv_4)
 
conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
# Batch normalization layer
batch_norm_5 = BatchNormalization()(conv_5)
 
conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPool2D(pool_size=(2, 1))(batch_norm_6)
 
conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)
 
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)
 
# bidirectional LSTM layers with units=128
blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(blstm_1)
 
outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)
 
act_model = Model(inputs, outputs)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [12]:
labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
 
 
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
 
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
 
 
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([outputs, labels, input_length, label_length])
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [13]:
type(labels)

tensorflow.python.framework.ops.Tensor

In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')
 
filepath="best_model.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callbacks_list = [checkpoint]

In [0]:
training_img = np.array(training_img)
train_input_length = np.array(train_input_length)
train_label_length = np.array(train_label_length)

valid_img = np.array(valid_img)
valid_input_length = np.array(valid_input_length)
valid_label_length = np.array(valid_label_length)

In [16]:
batch_size = 256
epochs = 10
model.fit(x=[training_img, train_padded_txt, train_input_length, train_label_length], 
          y=np.zeros(len(training_img)), batch_size=batch_size, epochs = epochs, 
          validation_data = ([valid_img, valid_padded_txt, valid_input_length, 
                              valid_label_length], [np.zeros(len(valid_img))]), 
          verbose = 1, callbacks = callbacks_list)

Train on 135000 samples, validate on 15000 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 24.23226, saving model to best_model.hdf5
Epoch 2/10
Epoch 00002: val_loss improved from 24.23226 to 12.94129, saving model to best_model.hdf5
Epoch 3/10
Epoch 00003: val_loss improved from 12.94129 to 12.00095, saving model to best_model.hdf5
Epoch 4/10
Epoch 00004: val_loss did not improve from 12.00095
Epoch 5/10
Epoch 00005: val_loss did not improve from 12.00095
Epoch 6/10
Epoch 00006: val_loss did not improve from 12.00095
Epoch 7/10
Epoch 00007: val_loss did not improve from 12.00095
Epoch 8/10
Epoch 00008: val_loss did not improve from 12.00095
Epoch 9/10
Epoch 00009: val_loss did not improve from 12.00095
Epoch 10/10
Epoch 00010: val_loss did not improve from 12.00095


<tensorflow.python.keras.callbacks.History at 0x7f72c11ea2b0>

In [18]:
# load the saved best model weights
act_model.load_weights('best_model.hdf5')

# predict outputs on validation images
prediction = act_model.predict(valid_img)
 
# use CTC decoder
out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1],
                         greedy=True)[0][0])
 

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [19]:
# see the results
i = 0
count = 0
for x in out:
    #print(valid_orig_txt[i])
    r = []
    for p in x:  
        if int(p) != -1:
            #print(char_list[int(p)], end = '')   
            r.append(char_list[int(p)])    
    #print('\n')
    if valid_orig_txt[i] == ''.join(r):
      count += 1
    i+=1
print(count,'/',i,'=',count/i)

8660 / 15000 = 0.5773333333333334


In [25]:
scores = model.evaluate(valid_img, valid_txt, verbose=0)
print(scores[1]*100)

ValueError: ignored

In [21]:
valid_txt[0]

[30, 23, 15, 4, 13, 3]