# Install dependencies

In [None]:
# Pin the TensorFlow / Keras versions used by this notebook.
# NOTE(review): `%pip install ...` installs into the running kernel's
# environment, whereas `!pip` runs in a sub-shell — consider switching.
!pip install tensorflow-gpu==2.1
!pip install keras==2.3.0

# Connect Google Drive

In [None]:
# Mount Google Drive inside the Colab VM.
from google.colab import drive
drive.mount('/content/gdrive')
# Symlink the Drive root to /mydrive so later cells can use a path without spaces.
!ln -s /content/gdrive/My\ Drive/ /mydrive
# List the linked folder as a quick check that the mount and the link worked.
!ls /mydrive

# Import libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import cv2
from scipy import misc
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
import h5py

from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, LSTM, Embedding, TimeDistributed, \
Conv2D, MaxPooling2D, GlobalMaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image, ImageDraw, ImageFont
from keras.datasets import mnist

import os
from os import path
import datetime
import random
from random import shuffle, randint
import glob
import shutil

import pickle
import gzip

# Useful commands

In [None]:
# Show the current working directory; the relative `working_dir/` used below is resolved against it.
%pwd

# Useful constants

In [13]:
# Scratch directory (relative to the current working directory) that receives
# the extracted archives and the TensorBoard logs.
working_dir = "working_dir/"

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of a separate isdir() + mkdir().
os.makedirs(working_dir, exist_ok=True)

# Token vocabulary: 's' marks the start of a sequence, '0'-'9' are the digits
# and 'e' marks the end of a sequence.
vocab = ['s', '0', '1', '2', '3', '4', '5', '6',
            '7', '8', '9', 'e']

# Lookup tables between a token and its integer id (its position in `vocab`).
ixtoword = dict(enumerate(vocab))
wordtoix = {word: index for index, word in ixtoword.items()}

vocab_size = len(ixtoword)

# max length of the digit sequence
max_length = 6

# Useful functions

In [14]:
def visualize_data(images):
    """Plot up to the first 25 images on a 5x5 grid of grayscale subplots.

    Args:
        images: sized, indexable collection of images; every item must
            support ``.reshape(25, 50)``.
    """
    # Start at index 0 and stop at the end of the data. The previous version
    # read images[i+1], which skipped the first image and raised IndexError
    # for collections holding fewer than 26 images.
    for i in range(min(25, len(images))):
        plt.subplot(5, 5, i + 1)
        plt.imshow(images[i].reshape(25, 50), cmap='gray')
        plt.axis('off')

def encode_sequence(img, string, wordtoix):
    """Expand one (image, label) pair into next-token training samples.

    The label is wrapped in start/end tokens and split into every
    (prefix -> next token) pair, so a label of n known characters yields
    n + 1 samples that all share the same image.

    Args:
        img: the image (anything ``np.asarray`` accepts).
        string: the label; characters missing from ``wordtoix`` are dropped.
        wordtoix: dict mapping each token to its integer id.

    Returns:
        Tuple ``(images, in_sequences, out_sequences)`` of numpy arrays with
        one row per sample: the image, the prefix padded to the module-level
        ``max_length``, and the next token one-hot encoded over the
        module-level ``vocab_size``.
    """
    # Take the delimiter ids from the mapping rather than hard-coding them;
    # 0 and 11 are the historical values and remain the fallback.
    start_ix = wordtoix.get('s', 0)
    end_ix = wordtoix.get('e', 11)
    seq = [start_ix] + [wordtoix[char] for char in string if char in wordtoix] + [end_ix]

    # The image is identical for every prefix, so convert it only once.
    img_array = np.asarray(img)

    images, in_sequences, out_sequences = [], [], []
    # split one sequence into multiple X, y pairs
    for i in range(1, len(seq)):
        # pad input sequence (pad_sequences' defaults are used, so the padding
        # value is 0 — the same id as the 's' token in this notebook's vocab)
        in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
        # encode output sequence
        out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
        # store
        images.append(img_array)
        in_sequences.append(in_seq)
        out_sequences.append(out_seq)
    return np.array(images), np.array(in_sequences), np.array(out_sequences)

def normalize_data(img_data):
    """Return a float32 copy of ``img_data`` with every value divided by 255.

    The input array is left untouched; no range check is performed.
    """
    as_float = img_data.astype(np.float32)
    return as_float / np.float32(255)

# Load synthetic data using different fonts

In [None]:
# Extract the fonts archive from Google Drive into working_dir
# (presumably producing the working_dir/fonts folder read by the next cell).
!7za x /mydrive/Digits_ocr/fonts.7z -oworking_dir

In [None]:
# Seed Python's `random` module, which drives every random choice in the generator below.
random.seed(42)

# Number of synthetic images to render (used by both synthetic generators).
SAMPLES = 100000
# Folder holding the font files used for rendering.
fonts_path = working_dir + "fonts"

def load_synthetic_data_from_fonts(samples, fonts_path):
    """Render random digit strings with random fonts and encode them for training.

    Every sample is a 50x25 (width x height) 8-bit grayscale image with a dark
    background (gray level 0-30) and light text (gray level 150-255). The font
    and its size (10-16) are random, and the string holds between 1 and
    ``int(50 / font_size)`` random digits.

    Args:
        samples: number of images to render.
        fonts_path: folder in which every entry is a font file that
            ``ImageFont.truetype`` can load.

    Returns:
        Tuple ``(X1, X2, y)`` built by ``encode_sequence`` with one row per
        label prefix: the images with a trailing channel axis, the padded
        input sequences and the one-hot next tokens.

    Raises:
        ValueError: if ``fonts_path`` contains no entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes a given
    # random seed always select the same fonts.
    fonts_files = sorted(os.listdir(fonts_path))
    fonts_files_num = len(fonts_files)
    if fonts_files_num == 0:
        raise ValueError('No font files found in {}'.format(fonts_path))

    X1_synt, X2_synt, y_synt = [], [], []

    for _ in range(samples):
        # random font
        font_file = random.randint(0, fonts_files_num - 1)
        font_size = random.randint(10, 16)
        fnt = ImageFont.truetype('{}/{}'.format(fonts_path, fonts_files[font_file]), font_size)

        # number of digits in number: larger fonts get fewer digits so the
        # text still fits into the 50 px wide image
        digits = random.randint(1, int(50 / font_size))

        # concatenate digits into number
        number = ''.join(str(random.randint(0, 9)) for _ in range(digits))

        # dark background
        background = random.randint(0, 30)
        img = Image.new('L', (50, 25), color=background)

        # draw number: larger fonts start further to the left
        start_x = random.randint(16 - font_size, 18 - font_size)
        start_y = random.randint(2, 4)
        d = ImageDraw.Draw(img)

        # light foreground
        foreground = random.randint(150, 255)
        d.text((start_x, start_y), number, font=fnt, fill=foreground)

        img_array, in_seq, out_seq = encode_sequence(img, number, wordtoix)
        X1_synt.extend(img_array)
        X2_synt.extend(in_seq)
        y_synt.extend(out_seq)

    X1_synt = np.array(X1_synt)
    X2_synt = np.array(X2_synt)
    y_synt = np.array(y_synt)

    # add the trailing channel axis
    X1_synt = np.expand_dims(X1_synt, axis=3)

    return X1_synt, X2_synt, y_synt

# Generate the font-based synthetic set and report the array shapes.
X1_synt_fonts, X2_synt_fonts, y_synt_fonts = load_synthetic_data_from_fonts(SAMPLES, fonts_path)
print(X1_synt_fonts.shape)
print(X2_synt_fonts.shape)
print(y_synt_fonts.shape)

# Normalize the images: convert to float32 and divide by 255.
X1_synt_fonts = normalize_data(X1_synt_fonts)

# Visualize some of the generated digit sequences.
visualize_data(X1_synt_fonts)

# Load synthetic data using MNIST digits

In [None]:
random.seed(42)

def load_synthetic_data_from_mnist(samples):
    """Compose random MNIST digits into 25x50 digit-sequence images.

    Each image holds 1-5 digits. Every digit is cropped from an MNIST training
    image, resized to 10x12 (width x height) and pasted into its own 10 px wide
    slot. All digits of an image share one random vertical offset (4-12 px),
    and every digit except the one in the last slot gets its own random
    horizontal offset (0-2 px).

    Args:
        samples: number of images to compose.

    Returns:
        Tuple ``(X1, X2, y)`` built by ``encode_sequence`` with one row per
        label prefix: the images with a trailing channel axis, the padded
        input sequences and the one-hot next tokens.
    """
    # Canvas geometry: n_len slots of `width` pixels, `height` pixels tall.
    n_len, height, width = 5, 25, 10
    digit_height = 12

    # Only the MNIST training split is used.
    (X_raw, y_raw), _ = mnist.load_data()
    X_len = X_raw.shape[0]

    X1_synt, X2_synt, y_synt = [], [], []

    for _ in range(samples):
        # generate random numbers of digits
        img = np.zeros((height, width * n_len), dtype=np.uint8)
        n_digit = random.randint(1, n_len)
        number = ""
        shift_vert = random.randint(4, 12)
        for i in range(n_digit):
            index = random.randint(0, X_len - 1)
            # crop the source digit, then resize it to the slot size
            digit = X_raw[index][3:28, 4:24]
            digit = cv2.resize(digit, (width, digit_height))
            # The last slot has no room to shift right. A shifted digit can
            # reach up to 2 px into the next slot, where the next digit (if
            # any) overwrites those columns.
            shift_hor = random.randint(0, 2) if i != n_len - 1 else 0
            left = shift_hor + i * width
            img[shift_vert:shift_vert + digit_height, left:left + width] = digit
            number += str(y_raw[index])

        img_array, in_seq, out_seq = encode_sequence(img, number, wordtoix)
        X1_synt.extend(img_array)
        X2_synt.extend(in_seq)
        y_synt.extend(out_seq)

    X1_synt = np.array(X1_synt)
    X2_synt = np.array(X2_synt)
    y_synt = np.array(y_synt)

    # add the trailing channel axis
    X1_synt = np.expand_dims(X1_synt, axis=3)

    return X1_synt, X2_synt, y_synt

# Generate the MNIST-based synthetic set (same sample count as the font set) and report the array shapes.
X1_synt_mnist, X2_synt_mnist, y_synt_mnist = load_synthetic_data_from_mnist(int(SAMPLES))
print(X1_synt_mnist.shape)
print(X2_synt_mnist.shape)
print(y_synt_mnist.shape)

# Normalize the images: convert to float32 and divide by 255.
X1_synt_mnist = normalize_data(X1_synt_mnist)

# Visualize some of the generated digit sequences.
visualize_data(X1_synt_mnist)

# Load real, custom data

In [None]:
# Extract the archive of real images from Google Drive into working_dir
# (presumably producing the working_dir/custom_digits_sequences folder read by the next cell).
!7za x /mydrive/Digits_ocr/custom_digits_sequences.7z -oworking_dir

In [None]:
random.seed(42)

def load_real_data():
    """Load the labelled real images from ``working_dir/custom_digits_sequences``.

    The folder is expected to hold one sub-folder per label: the sub-folder's
    name is used as the digit string for every image file inside it. Images
    are read as grayscale.

    Returns:
        Tuple ``(X1, X2, y)`` built by ``encode_sequence`` with one row per
        label prefix: the images with a trailing channel axis, the padded
        input sequences and the one-hot next tokens.

    Raises:
        ValueError: if no readable image is found.
    """
    root = os.path.join(working_dir, 'custom_digits_sequences')

    X1_real, X2_real, y_real = [], [], []

    # Sort both listings: os.listdir returns entries in arbitrary order, and
    # the sample order should not depend on the file system.
    for label in sorted(os.listdir(root)):
        label_dir = os.path.join(root, label)
        if not os.path.isdir(label_dir):
            # stray file next to the label folders
            continue
        for file_name in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, file_name)
            im = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
            if im is None:
                # cv2.imread signals an unreadable file by returning None;
                # passing that on would corrupt the image array.
                print('Skipping unreadable image:', file_path)
                continue
            img_array, in_seq, out_seq = encode_sequence(im, label, wordtoix)
            X1_real.extend(img_array)
            X2_real.extend(in_seq)
            y_real.extend(out_seq)

    if not X1_real:
        raise ValueError('No readable images found in {}'.format(root))

    X1_real = np.array(X1_real)
    X2_real = np.array(X2_real)
    y_real = np.array(y_real)

    # add the trailing channel axis
    X1_real = np.expand_dims(X1_real, axis=3)

    return X1_real, X2_real, y_real

# Load the real images and report the array shapes.
X1_real, X2_real, y_real = load_real_data()
print(X1_real.shape)
print(X2_real.shape)
print(y_real.shape)

# Normalize the images: convert to float32 and divide by 255.
X1_real = normalize_data(X1_real)

# Visualize some of the loaded digit sequences.
visualize_data(X1_real)

# Model building

In [None]:
def get_model(learning_rate=0.001):
    """Build and compile the image + partial-sequence next-token model.

    A small CNN encodes the 25x50x1 image and an Embedding + LSTM branch
    encodes the tokens decoded so far. Both feature vectors are concatenated
    and fed to a dense softmax over the vocabulary that predicts the next
    token.

    Args:
        learning_rate: learning rate of the Adam optimizer. The default,
            0.001, is what the previous ``optimizer='adam'`` string resolved
            to, so ``get_model()`` trains exactly as before.

    Returns:
        The compiled ``tf.keras`` model with inputs ``[image, sequence]``.
    """
    # These must stay in sync with the module-level vocab_size / max_length
    # used when the training data is encoded.
    vocab_size = 12
    max_length = 6
    embedding_dim = 50

    # input image
    inputs_img = Input(shape=(25, 50, 1))

    # convolution model: three conv + max-pool stages; only the first
    # convolution uses a stride of 2
    x = inputs_img
    for filters, conv_stride in ((32, 2), (64, 1), (128, 1)):
        x = Conv2D(filters, kernel_size=(3, 3), strides=conv_stride,
                   padding='valid', activation='relu')(x)
        x = MaxPooling2D(pool_size=(3, 3), strides=(1, 1),
                         padding='valid')(x)

    # fully connected
    x = Flatten()(x)
    fe1 = Dropout(0.25)(x)
    fe2 = Dense(128, activation='relu')(fe1)

    # partial caption sequence model
    inputs_seq = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim)(inputs_seq)
    # NOTE(review): this Masking layer sees the embedding *output*, so it only
    # masks a timestep whose whole embedding vector equals 0; it does not mask
    # the padded token id 0. `mask_zero=True` on the Embedding would, but id 0
    # is also the 's' token and the saved weights were presumably trained
    # without it, so the layer is left as is.
    mask = tf.keras.layers.Masking(mask_value=0)(se1)
    se2 = Dropout(0.5)(mask)
    se3 = LSTM(64)(se2)

    # decoder (feed forward) model
    decoder1 = tf.keras.layers.concatenate([fe2, se3])
    decoder2 = Dense(64, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # merge the two input models
    model = Model(inputs=[inputs_img, inputs_seq], outputs=outputs)

    # Pass the optimizer object to compile(). The previous code built an Adam
    # instance (learning rate 1e-6) but compiled with the string 'adam', so
    # that instance was silently ignored.
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=opt,
                  metrics=['accuracy'])

    return model

# Build the model once; the training, saving and prediction cells below use this global `model`.
model = get_model()
# Print the layer-by-layer summary.
model.summary()

# Visualize the network model using Graphviz

In [None]:
from tensorflow.keras.utils import plot_model, model_to_dot
from IPython.display import SVG
# Import IPython's Image under an alias: a bare `Image` would replace the PIL
# `Image` imported at the top of the notebook, and re-running the font data
# generator (which calls `Image.new`) after this cell would then fail.
from IPython.display import Image as IPythonImage

# Render the model graph to a PNG file and display it.
plot_model(model, to_file='model.png', show_shapes=True)
IPythonImage('model.png')

# Setup TensorBoard before training

In [None]:
# Load the TensorBoard notebook extension and open it on the log folder the training cell builds its log_dir under.
%load_ext tensorboard
%tensorboard --logdir working_dir/logs/fit

# Train

In [None]:
random.seed(42)

# Combine the two synthetic sets and the real set into one training pool
# and print the resulting shapes.
X1 = np.concatenate((X1_synt_fonts, X1_synt_mnist, X1_real), axis=0)
X2 = np.concatenate((X2_synt_fonts, X2_synt_mnist, X2_real), axis=0)
y = np.concatenate((y_synt_fonts, y_synt_mnist, y_real), axis=0)
print(X1.shape)
print(X2.shape)
print(y.shape)

# Shuffle the three arrays with one common permutation BEFORE calling fit().
# Keras takes the `validation_split` fraction from the END of the data, before
# its own shuffling, so without this step the validation set is the tail of the
# concatenation: the real images (concatenated last) would be validated on but
# not trained on.
# A fresh RandomState with the same seed produces the same permutation for
# arrays of equal length, and shuffle() works in place, so the large image
# array is not copied.
# Note: every image contributes one row per label prefix, so rows of the same
# image can end up on both sides of the split.
for _arr in (X1, X2, y):
    np.random.RandomState(42).shuffle(_arr)

# Load existing weights
model.load_weights('/mydrive/Digits_ocr/model4.h5')

# TensorBoard settings (the callback is only used if it is passed to fit() below)
log_dir = working_dir + "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=0)

# Train the neural network.
model.fit(
    x=[X1, X2], y=y,
    epochs=20,
    shuffle=True,
    validation_split=0.2,
    #validation_data=([X1_real, X2_real], y_real),
    #callbacks=[tensorboard_callback]
)

# Save the model

In [24]:
# Store the trained weights on Google Drive under a new file name (the training cell loaded model4.h5).
model.save_weights('/mydrive/Digits_ocr/model5.h5')

# Predict

In [None]:
def greedySearch(photo):
    """Decode the digit sequence shown in ``photo`` greedily, one token at a time.

    Starting from the 's' token, the global ``model`` is asked for the most
    likely next token until it predicts 'e' or ``max_length`` tokens have been
    decoded.

    Args:
        photo: batch containing one image, shaped like the model's image
            input.

    Returns:
        The predicted tokens, without the start/end markers, joined by single
        spaces (e.g. ``'4 2 7'``).
    """
    in_text = 's'
    for _ in range(max_length):
        sequence = [wordtoix[w] for w in in_text if w in wordtoix]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = np.argmax(model.predict([photo, sequence], verbose=0))
        word = ixtoword[yhat]
        if word == 'e':
            break
        in_text += word
    # Drop only the leading 's'. The end token is never appended, so the last
    # digit is kept even when decoding stops because max_length was reached
    # (the previous `[1:-1]` slice cut it off in that case).
    return ' '.join(in_text[1:])

# check on synthetic data
image = X1_synt_fonts[100]
plt.imshow(image.reshape(25,50), cmap='gray')
plt.show()

# prepare for the inference
image = image.reshape(-1, 25, 50, 1)

if image is not None:
    print("Greedy:", greedySearch(image))
else:
    print('Image is None')