In [22]:
import os
from PIL import Image
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [6]:
def resize(img, base_width=128, base_height=32):
    """Scale ``img`` to fit a fixed-size white canvas, keeping its aspect ratio.

    The image is shrunk or enlarged so that it touches the canvas on its
    limiting side, then pasted at the top-left corner of a new grayscale
    ('L') canvas filled with 255. Any uncovered canvas area stays white.

    Args:
        img: A PIL image (anything exposing ``size`` and ``resize``).
        base_width: Canvas width in pixels. Defaults to 128.
        base_height: Canvas height in pixels. Defaults to 32.

    Returns:
        A new ``base_width`` x ``base_height`` PIL image in mode 'L'.

    Raises:
        ValueError: If ``img`` has a zero width or height.
    """
    w, h = img.size
    if w <= 0 or h <= 0:
        raise ValueError("cannot resize an image of size %r" % ((w, h),))

    # Height the image would get if it were scaled to the full canvas width.
    height_at_full_width = h * base_width / w
    if height_at_full_width <= base_height:
        scaled = (base_width, height_at_full_width)
    else:
        # Too tall for that: the height is the limiting side instead.
        scaled = (w * base_height / h, base_height)

    # int() truncates, so a very thin image would end up with a 0-pixel side;
    # clamp every side into [1, canvas side] so the resize target is valid.
    new_dim = tuple(
        max(1, min(limit, int(side)))
        for side, limit in zip(scaled, (base_width, base_height))
    )

    blank_img = Image.new('L', (base_width, base_height), 255)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    img = img.resize(new_dim, Image.LANCZOS)
    blank_img.paste(img)
    return blank_img

def image_loader(filepath):
    """Load every non-empty ``.png`` file under ``filepath`` as a resized array.

    The directory tree is walked in sorted order so that the result order
    is the same on every run and platform; callers that split the arrays
    by position therefore get a reproducible split.

    Args:
        filepath: Root directory to search recursively.

    Returns:
        A pair ``(X, X_filename)`` of NumPy arrays: ``X`` stacks the result of
        ``resize`` for each image, and ``X_filename`` holds the matching file
        names with the ``.png`` extension removed.
    """
    X = []
    X_filename = []
    for dirpath, dirnames, filenames in os.walk(filepath):
        # Sorting in place makes os.walk descend into subdirectories in a
        # deterministic order.
        dirnames.sort()
        for filename in sorted(filenames):
            # Compare the real extension: a bare "[-3:] == 'png'" check also
            # accepts names such as "wxpng" and then strips the wrong characters.
            stem, ext = os.path.splitext(filename)
            if ext != '.png':
                continue
            path = os.path.join(dirpath, filename)
            # Zero-byte files cannot be decoded; skip them.
            if os.stat(path).st_size == 0:
                continue
            # The context manager closes the underlying file once the pixels
            # have been read by resize().
            with Image.open(path) as raw:
                img = resize(raw)
            X.append(np.array(img))
            X_filename.append(stem)
    return np.array(X), np.array(X_filename)

def text_loader(filepath):
    """Parse a whitespace-separated label file into a ``{name: word}`` dict.

    For every data line the first field is used as the key and the last
    field as the value. Blank lines and lines starting with ``#`` are
    skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        dict mapping the first field of each data line to its last field,
        without any trailing newline.
    """
    Y = {}
    # "with" guarantees the handle is closed even if parsing fails.
    with open(filepath, 'r') as file:
        for line in file:
            # split() with no argument also drops the trailing newline, so
            # the stored word is clean.
            fields = line.split()
            if not fields or fields[0].startswith('#'):
                continue  # blank line or comment/header line
            Y[fields[0]] = fields[-1]
    return Y


def data_loader(image_filepath, text_filepath):
    """Load images and their labels and split them 95% / 5% by position.

    Args:
        image_filepath: Directory handed to ``image_loader``.
        text_filepath: Label file handed to ``text_loader``.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``: the first 95% of the loaded
        images, the remainder, and NumPy arrays of the labels looked up by
        file name for each part.

    Raises:
        KeyError: If a loaded image name has no entry in the label file.
    """
    images, names = image_loader(image_filepath)
    label_by_name = text_loader(text_filepath)

    # Position at which the training part ends and the test part begins.
    cut = int(0.95 * len(images))

    def labels_for(selected_names):
        # Look each name up in order, so labels line up with the images.
        return np.array([label_by_name[name] for name in selected_names])

    Y_train = labels_for(names[:cut])
    Y_test = labels_for(names[cut:])
    return images[:cut], images[cut:], Y_train, Y_test

In [7]:
# Dataset location. Set the HTR_DATA_ROOT environment variable to run the
# notebook on another machine; the default keeps the original local path.
DATA_ROOT = os.environ.get(
    'HTR_DATA_ROOT',
    '/Users/prachigoyal/Desktop/htr/handwriting-recognition',
)
# Directory tree of word images and the text file holding their labels.
image_filepath = os.path.join(DATA_ROOT, 'words')
text_filepath = os.path.join(DATA_ROOT, 'words.txt')

In [9]:
# Load all word images with their labels; data_loader keeps the first 95% for
# training and the remaining 5% for testing.
X_train, X_test, Y_train, Y_test = data_loader(image_filepath, text_filepath)

In [28]:
# Sanity check: view one sample as a DataFrame and print its shape followed by
# the shape of the whole training array.
tmp = pd.DataFrame(X_train[0])
print(tmp.shape, X_train.shape, sep='\n')

(32, 128)
(109552, 32, 128)


In [55]:
def _conv_block(inputs, kernel_shape, pool_window, normalize=False):
    """Run one conv -> (optional normalisation) -> leaky ReLU -> max-pool stage.

    Args:
        inputs: 4-D tensor fed to ``tf.nn.conv2d``.
        kernel_shape: Shape of the convolution kernel to create, as
            ``[height, width, in_channels, out_channels]``.
        pool_window: Used both as the max-pool window and as its strides.
        normalize: If true, normalise the convolution output with its own
            mean and variance before the activation.

    Returns:
        The pooled output tensor.
    """
    # A fresh, randomly initialised kernel is created on every call.
    kernel = tf.Variable(tf.random.truncated_normal(kernel_shape, stddev=0.1))
    conv = tf.nn.conv2d(inputs, kernel, padding='SAME', strides=(1, 1, 1, 1))
    if normalize:
        # NOTE(review): moments are taken over the batch axis only (axes=[0]);
        # for convolution outputs axes=[0, 1, 2] is the usual choice. Left
        # unchanged to keep the numerics as they were; confirm the intent.
        mean, variance = tf.nn.moments(conv, axes=[0])
        conv = tf.nn.batch_normalization(
            conv, mean, variance, offset=None, scale=None, variance_epsilon=0.001)
    activated = tf.nn.leaky_relu(conv, alpha=0.01)
    return tf.nn.max_pool(activated, pool_window, pool_window, 'VALID')


def setupCNN(cnnIn):
    """Push a batch of images through five convolution + pooling stages.

    A trailing channel axis is added to ``cnnIn`` and the result flows through
    the stages listed in ``layers`` below. The shape after every stage is
    printed. For a float32 input of shape (10, 32, 128) the recorded run shows:

        layer 0: (10, 32, 128, 1)
        layer 1: (10, 16, 64, 32)
        layer 2: (10, 8, 32, 64)
        layer 3: (10, 8, 16, 128)
        layer 4: (10, 8, 8, 128)
        layer 5: (10, 8, 4, 256)

    Every call creates new randomly initialised kernels, so two calls do not
    share weights.

    Args:
        cnnIn: 3-D batch of images; the notebook passes a float32 tensor.

    Returns:
        The output tensor of the fifth stage.
    """
    # Add the channel axis expected by conv2d.
    cnnIn = tf.expand_dims(input=tf.convert_to_tensor(cnnIn), axis=3)
    print("layer 0: " + str(cnnIn.shape))

    # (name scope, kernel shape, pool window/strides, normalise?)
    # Stages 1-2 halve axes 1 and 2; stages 3-5 halve axis 2 only.
    # The third stage used to reuse the scope name 'Conv_Pool_2'.
    layers = [
        ('Conv_Pool_1', [5, 5, 1, 32], (1, 2, 2, 1), False),
        ('Conv_Pool_2', [5, 5, 32, 64], (1, 2, 2, 1), False),
        ('Conv_Pool_3', [5, 5, 64, 128], (1, 1, 2, 1), True),
        ('Conv_4', [3, 3, 128, 128], (1, 1, 2, 1), False),
        ('Conv_Pool_5', [3, 3, 128, 256], (1, 1, 2, 1), False),
    ]

    pool = cnnIn
    for index, (scope, kernel_shape, pool_window, normalize) in enumerate(layers, start=1):
        with tf.name_scope(scope):
            pool = _conv_block(pool, kernel_shape, pool_window, normalize)
            print("layer " + str(index) + ": " + str(pool.shape))

    return pool

In [56]:
# Smoke test: run the first ten training images through the CNN as float32.
i = tf.convert_to_tensor(X_train[:10], dtype='float32')
o = setupCNN(i)

layer 0: (10, 32, 128, 1)
layer 1: (10, 16, 64, 32)
layer 2: (10, 8, 32, 64)
layer 3: (10, 8, 16, 128)
layer 4: (10, 8, 8, 128)
layer 5: (10, 8, 4, 256)
