# **Mount Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import glob
import multiprocessing
import shutil
import random
import torch
import torch.nn as nn
import torch.optim as optim
import json
import copy
import gc
import tensorflow.keras.backend as K


from tqdm import tqdm
from torch.utils.data import Dataset
from scipy import signal
from scipy.io import wavfile
from PIL import Image, ImageDraw2
from torch.utils.data import DataLoader
from functools import partial
from typing import List
from collections import defaultdict
from scipy.special import logsumexp  # log(p1 + p2) = logsumexp([log_p1, log_p2])
from torch.nn import CTCLoss
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional, Add, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.activations import relu, sigmoid, softmax
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import CSVLogger, TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

!pip install ipython-autotime
%load_ext autotime

time: 1.04 ms (started: 2023-10-15 11:13:41 +00:00)


In [3]:
src_path = '/content/drive/MyDrive/kalapa2023/OCR'
%cd $src_path

/content/drive/MyDrive/kalapa2023/OCR
time: 2.99 ms (started: 2023-10-15 11:13:41 +00:00)


# **Utils**

## **Config**

In [4]:
config = {
    'DATA_ROOT': src_path,
    'TRAIN_PATH': src_path + '/training_data',
    'VAL_SIZE': 0.2,
    'RANDOM_STATE': 42,
    'TIME_STEPS': 207,

    'EPOCHS': 100,
    'BATCH_SIZE': 32,
    # 'IMG_HEIGHT': 1600,
    # 'IMG_WIDTH': 96,
    # 'MAP_TO_SEQ_HIDDEN': 64,
    # 'RNN_HIDDEN': 256,
    # 'LEAKY_RELU': False,
    # 'TRAIN_BATCH_SIZE': 2,
    # 'EVAL_BATCH_SIZE': 2,
    # 'CPU_WORKERS': 2
}

time: 587 µs (started: 2023-10-15 11:13:41 +00:00)


## **Function**

In [5]:
"""Draw folder tree structure"""
def list_folders(path, level=1):
    if level == 4:
        return
    elif level == 1:
        print('\t' * (level-1) + '|-- ' + path)
        list_folders(path, level+1)
    else:
        try:
            for item in os.listdir(path):
                item_path = os.path.join(path, item)
                print('\t' * (level-1) + '|-- ' + item)
                os.path.isdir(item_path) and list_folders(item_path, level+1)
        except PermissionError:
            pass

"""Get min, max of height, width images in folder"""
def read_image_dimensions(image_path):
    img = cv2.imread(image_path)
    if img is not None:
        h, w, _ = img.shape
        return h, w
    return None

def min_max_height_width(images_path: List[str]):

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    dimensions = pool.map(read_image_dimensions, images_path)
    pool.close()
    pool.join()

    dimensions = [dim for dim in dimensions if dim is not None]

    min_height = min(dimensions, key=lambda x: x[0])
    max_height = max(dimensions, key=lambda x: x[0])
    min_width = min(dimensions, key=lambda x: x[1])
    max_width = max(dimensions, key=lambda x: x[1])

    total_height = sum(h for h, _ in dimensions)
    total_width = sum(w for _, w in dimensions)
    average_height = total_height / len(dimensions)
    average_width = total_width / len(dimensions)

    return min_height, max_height, min_width, max_width, total_height, total_width, average_height, average_width

"""Split train/val data"""
def split_and_save_train_val_data(data_dir, output_dir, val_split=0.2, random_seed=None):
    dirs = {
        'train': {
            'images': [],
            'annotations': []},
        'val': {
            'images': [],
            'annotations': []}}

    for t in ['train', 'val']:
        for d in ['images', 'annotations']:
            dirs[t][d] = os.path.join(output_dir, t, d)
            os.makedirs(dirs[t][d], exist_ok=True)

    annotations_dir = os.path.join(data_dir, 'annotations')
    annotations_files = [f for f in os.listdir(annotations_dir) if f.endswith('.txt')]
    writer_id_to_images = {}

    for annotation_file in annotations_files:
        writer_id = annotation_file[:-4]
        writer_id_to_images[writer_id] = []

        with open(os.path.join(annotations_dir, annotation_file), 'r') as f:
            lines = f.readlines()
            for line in lines:
                image_name, label = line.strip().split('\t')
                writer_id_to_images[writer_id].append((image_name, label))

    writer_ids = list(writer_id_to_images.keys())
    random.seed(random_seed)
    random.shuffle(writer_ids)
    num_val_writers = int(val_split * len(writer_ids))
    val_writers = writer_ids[:num_val_writers]
    train_writers = writer_ids[num_val_writers:]

    for t, writers in [('train', train_writers), ('val', val_writers)]:
        for writer_id in writers:
            writer_images_dir = os.path.join(dirs[t]['images'], writer_id)
            writer_annotations_file = os.path.join(dirs[t]['annotations'], f"{writer_id}.txt")
            os.makedirs(writer_images_dir, exist_ok=True)

            with open(writer_annotations_file, 'w') as f:
                for image_name, label in writer_id_to_images[writer_id]:
                    image_src = os.path.join(data_dir, 'images', image_name)
                    shutil.copy(image_src, writer_images_dir)
                    f.write(f"{image_name}\t{label}\n")

time: 3.92 ms (started: 2023-10-15 11:13:41 +00:00)


# **Train/Val split**

In [6]:
if not os.path.exists(config['DATA_ROOT'] + '/training_split_data'):
    data_dir = config['DATA_ROOT'] + '/training_data'
    output_dir = config['DATA_ROOT'] + '/training_split_data'
    split_and_save_train_val_data(data_dir, output_dir)
    print("Splitting data successfully !!! \u2705")
else:
    print(f"The {config['DATA_ROOT'] + '/training_split_data'} path is existed !!! \u2705")

The /content/drive/MyDrive/kalapa2023/OCR/training_split_data path is existed !!! ✅
time: 2.26 ms (started: 2023-10-15 11:13:41 +00:00)


# **Load Data**

In [7]:
    def load_data(root_dir):
        paths_dir=''
        paths, texts = [], []

        image_dir = os.path.join(root_dir, paths_dir, 'images')
        annotation_dir = os.path.join(root_dir, paths_dir, 'annotations')

        for id_dir in os.listdir(image_dir):
            id_annotation_file = os.path.join(annotation_dir, id_dir + '.txt')

            if not os.path.exists(id_annotation_file):
                continue

            with open(id_annotation_file) as f:
                annotation_lines = f.readlines()

            for img_file in os.listdir(os.path.join(image_dir, id_dir)):
                img_path = os.path.join(image_dir, id_dir, img_file)
                img_id = id_dir + '/' + img_file

                for line in annotation_lines:
                    line_parts = line.strip().split('\t')
                    if line_parts[0] == img_id:
                        texts.append(line_parts[1])
                        paths.append(img_path)
                        break

        return paths, texts

paths, labels = load_data(config['TRAIN_PATH'])
paths = paths[:1000]
labels = labels[:1000]
assert len(paths) == len(labels)

time: 259 ms (started: 2023-10-15 11:13:41 +00:00)


In [8]:
train_labels = {path: label for path, label in zip(paths, labels)}
assert len(train_labels) == len(paths)
assert isinstance(train_labels, dict)

time: 1.15 ms (started: 2023-10-15 11:13:42 +00:00)


In [9]:
char_list = sorted({char for label in train_labels.values() for char in label})
CHARS = "".join(char_list)
print(f"\u2705 The number of characters in labels: {len(CHARS)}")
print(f"\u2705 Show all possibles labels characters: {CHARS}")

✅ The number of characters in labels: 132
✅ Show all possibles labels characters:  '/123456789ABCDEFGHIJKLMNOPQRSTUVXYabcdefghiklmnopqrstuvxyÁÂÓÔÝàáâãèéêìíòóôõùúýăĐđĩũơƯưạảẤấầẩẫậắằẳẵặẹẻếềểễệịọỏốồỔổỗộớờởợụủứừửữựỳỵỷỹ
time: 3.55 ms (started: 2023-10-15 11:13:42 +00:00)


# **Data Processing**

## **char2label**

In [10]:
def encode_char2label(char_list: list, text: str):
    text_encoded = list()
    for char in text:
        try:
            text_encoded.append(char_list.index(char))
        except:
            print(f"No found '{char}' in char_list")
    return text_encoded

"""Test function"""
text_encoded = encode_char2label(char_list, "Nghị lực và kiên trì!")
text_encoded

No found '!' in char_list


[25, 42, 43, 108, 0, 46, 127, 38, 0, 56, 64, 0, 45, 44, 70, 48, 0, 54, 52, 71]

time: 12.8 ms (started: 2023-10-15 11:13:42 +00:00)


## **dict `path: label`**

In [11]:
dict_filepath_label = copy.deepcopy(train_labels)
dict_filepath_label

{'/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/2.jpg': 'Ấp An Phú B Long An Long Hồ Vĩnh Long',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/18.jpg': 'Tổ 8 Ấp Kiến Sơn Kiên Bình Kiên Lương Kiên Giang',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/26.jpg': 'Thôn Lạc Sơn Lạc Lâm Đơn Dương Lâm Đồng',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/12.jpg': 'Hòn Tre Kiên Hải Kiên Giang',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/9.jpg': 'Thôn La Vân 7 Quỳnh Hồng Quỳnh Phụ Thái Bình',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/1.jpg': 'Lam Cốt Tân Yên Bắc Giang',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/16.jpg': 'Khu Phố 7 Phường 2 Thị Xã Cai Lậy Tiền Giang',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/15.jpg': 'X Thuận Thành H Phổ Yên T Thái Nguyên',
 '/content/drive/MyDrive/kalapa2023/OCR/training_data/images/61/5.jpg': 'Cảm Ân Yên Bì

time: 44 ms (started: 2023-10-15 11:13:42 +00:00)


## **maximum label length**

In [12]:
max_length = max(len(text) for text in dict_filepath_label.values())
min_length = min(len(text) for text in dict_filepath_label.values())
max_length_key = next(key for key, text in dict_filepath_label.items() if len(text) == max_length)
min_length_key = next(key for key, text in dict_filepath_label.items() if len(text) == min_length)

print(f"\u2705 The maximum text length is: `{dict_filepath_label[max_length_key]}` with {max_length} characters")
print(f"\u2705 The minimum text length is: `{dict_filepath_label[min_length_key]}` with {min_length} characters")

✅ The maximum text length is: `Ấp Cầu Ngang Long Hữu Đông Cần Đước Long An Phan Hoai Nhan Falsc Jpq` with 68 characters
✅ The minimum text length is: `Tân Độ` with 6 characters
time: 4.48 ms (started: 2023-10-15 11:13:42 +00:00)


## **`min`, `max` of height, width**

In [13]:
# min_height, max_height, min_width, max_width, total_height, total_width, average_height, average_width = min_max_height_width(paths)

# print(f"\u2705 Min Height: {min_height[0]} \
#         \n\u2705 Max Height: {max_height[0]} \
#         \n\u2705 Min Width: {min_width[1]} \
#         \n\u2705 Max Width: {max_width[1]} \
#         \n\u2705 Total Height: {total_height} \
#         \n\u2705 Total Width: {total_width} \
#         \n\u2705 Average Height: {average_height} \
#         \n\u2705 Average Width: {average_width}")

time: 282 µs (started: 2023-10-15 11:13:42 +00:00)


## **train/val split**

In [14]:
train_image_paths, val_image_paths = train_test_split(
    paths,
    test_size=config['VAL_SIZE'],
    random_state=config['RANDOM_STATE'])

assert len(train_image_paths) + len(val_image_paths) == len(paths)

time: 9.06 ms (started: 2023-10-15 11:13:42 +00:00)


## **preprocess images**

In [17]:
def images_preprocessing(image_paths, mode=None, height=None, width=None, resize_max_width=0):
    set_img = []
    set_txt = []
    set_input_length = []
    set_label_length = []
    orig_txt = []

    i = 0

    for image_path in image_paths:

        # read input image and convert into gray scale image
        img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2GRAY)

        if mode == 'train':
            height, width = img.shape

        # in this dataset, we don't need to do any resize at all here.
        img = cv2.resize(img,(int(118/height*width),118))

        if mode == 'train':
            height, width = img.shape

        if img.shape[1] > resize_max_width:
            resize_max_width = img.shape[1]

        img = np.pad(img, ((0,0),(0, 2065-width)), 'median')

        # YOUR PART: Blur it
        img = cv2.GaussianBlur(img, (5,5), 0)

        # YOUR PART: Threshold the image using adapative threshold
        img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 4)

        # add channel dimension
        img = np.expand_dims(img , axis = 2)

        # Normalize each image
        img = img/255.

        label = dict_filepath_label[image_path]

        orig_txt.append(label)

        set_label_length.append(len(label))

        # our time steps for valid input
        set_input_length.append(config['TIME_STEPS'])
        set_img.append(img)

        # convert words to digits based on charlist
        set_txt.append(encode_char2label(char_list, label))
        i += 1
        if i % 500 == 0:
            print (f"\u2705 Has processed {mode} {i} files")

    return (set_img,
            set_txt,
            set_input_length,
            set_label_length,
            orig_txt,
            height,
            width,
            resize_max_width)

time: 2.42 ms (started: 2023-10-15 11:14:40 +00:00)


In [18]:
(training_img,
 training_txt,
 train_input_length,
 train_label_length,
 valid_orig_txt,
 height,
 width,
 resize_max_width) = images_preprocessing(image_paths=train_image_paths,
                                          mode='train')

print(resize_max_width)

✅ Has processed train 500 files
2065
time: 11.6 s (started: 2023-10-15 11:14:46 +00:00)


In [19]:
# for i in range(5):
#     plt.figure(figsize=(15, 2))
#     plt.imshow(training_img[i][:,:,0], cmap="gray")
#     plt.show()

time: 317 µs (started: 2023-10-15 11:15:01 +00:00)


In [20]:
(valid_img,
 valid_txt,
 valid_input_length,
 valid_label_length,
 orig_txt,
 height,
 width,
 resize_max_width) = images_preprocessing(image_paths=val_image_paths,
                                          mode='val',
                                          height=height,
                                          width=width,
                                          resize_max_width=resize_max_width)
print(resize_max_width)

2065
time: 2.84 s (started: 2023-10-15 11:15:02 +00:00)


In [21]:
# for i in range(5):
#     plt.figure(figsize=(15,2))
#     plt.imshow(valid_img[i][:,:,0], cmap="gray")
#     plt.show()

time: 226 µs (started: 2023-10-15 11:15:05 +00:00)


## **padding label length**

In [22]:
max_label_len = config['TIME_STEPS']
train_padded_txt = pad_sequences(training_txt, maxlen=max_label_len, padding='post', value=0)
valid_padded_txt = pad_sequences(valid_txt, maxlen=max_label_len, padding='post', value=0)

train_padded_txt[0], valid_padded_txt[0]

(array([ 31,  43, 108,   0,  31,  52,  91,  48,   0,  25, 103,  48,  43,
          0,  33,  44, 107,  54,   0,  35,  70,  48,   0,  13,  96,  38,
          0,  18,  44,  36,  48,  42,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

time: 11.3 ms (started: 2023-10-15 11:15:06 +00:00)


# **Model**

## **Building CRNN and LSTM**

In [23]:
inputs = Input(shape=(118, resize_max_width, 1))

# Block 1
x = Conv2D(64, (3,3), padding='same')(inputs)
x = MaxPool2D(pool_size=3, strides=3)(x)
x = Activation('relu')(x)
x_1 = x

# Block 2
x = Conv2D(128, (3,3), padding='same')(x)
x = MaxPool2D(pool_size=3, strides=3)(x)
x = Activation('relu')(x)
x_2 = x

# Block 3
x = Conv2D(256, (3,3), padding='same')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x_3 = x

# Block4
x = Conv2D(256, (3,3), padding='same')(x)
x = BatchNormalization()(x)
x = Add()([x,x_3])
x = Activation('relu')(x)
x_4 = x

# Block5
x = Conv2D(512, (3,3), padding='same')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x_5 = x

# Block6
x = Conv2D(512, (3,3), padding='same')(x)
x = BatchNormalization()(x)
x = Add()([x,x_5])
x = Activation('relu')(x)

# Block7
x = Conv2D(1024, (3,3), padding='same')(x)
x = BatchNormalization()(x)
x = MaxPool2D(pool_size=(3, 1))(x)
x = Activation('relu')(x)

# pooling layer with kernel size (2,2) to make the height/2 #(1,9,512)
x = MaxPool2D(pool_size=(3, 1))(x)

# # to remove the first dimension of one: (1, 31, 512) to (31, 512)
squeezed = Lambda(lambda x: K.squeeze(x, 1))(x)

# # # bidirectional LSTM layers with units=128
blstm_1 = Bidirectional(LSTM(512, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(512, return_sequences=True, dropout = 0.2))(blstm_1)

# # this is our softmax character proprobility with timesteps
outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)

# model to be used at test time
act_model = Model(inputs, outputs)

time: 8.6 s (started: 2023-10-15 11:15:10 +00:00)


In [24]:
act_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 118, 2065, 1)]       0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 118, 2065, 64)        640       ['input_1[0][0]']             
                                                                                                  
 max_pooling2d (MaxPooling2  (None, 39, 688, 64)          0         ['conv2d[0][0]']              
 D)                                                                                               
                                                                                                  
 activation (Activation)     (None, 39, 688, 64)          0         ['max_pooling2d[0][0]']   

## **CTC**

In [25]:
# define the label input shape for ctc
labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')

# define the length of input and label for ctc
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# define a ctc lambda function to take arguments and return ctc_bach_cost
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

# out loss function (just take the inputs and put it in our ctc_batch_cost)
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([outputs, labels, input_length, label_length])

#model to be used at training time
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)

time: 155 ms (started: 2023-10-15 11:15:28 +00:00)


In [26]:
# ready ctc loss function and optimizers
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')

# our callbacks hell to optimize our learning
callbacks = [
    TensorBoard(
        log_dir='./logs',
        histogram_freq=10,
        profile_batch=0,
        write_graph=True,
        write_images=False,
        update_freq="epoch"),
    ModelCheckpoint(
        filepath=os.path.join('checkpoint_weights.hdf5'),
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=True,
        verbose=1),
    EarlyStopping(
        monitor='val_loss',
        min_delta=1e-8,
        patience=20,
        restore_best_weights=True,
        verbose=1),
    ReduceLROnPlateau(
        monitor='val_loss',
        min_delta=1e-8,
        factor=0.2,
        patience=10,
        verbose=1)
]
callbacks_list = callbacks

time: 29.6 ms (started: 2023-10-15 11:15:30 +00:00)


In [None]:
model.summary()

## **Training**

In [27]:
# ready our training data
training_img = np.array(training_img)
train_input_length = np.array(train_input_length)  # all must be equal length to T timesteps
train_label_length = np.array(train_label_length)  # different length (only the same in Captcha dataset)

# ready our validating data
valid_img = np.array(valid_img)
valid_input_length = np.array(valid_input_length) # all must be equal length to T timesteps
valid_label_length = np.array(valid_label_length) # different length (only the same in Captcha dataset)

time: 856 ms (started: 2023-10-15 11:15:33 +00:00)


In [28]:
history = model.fit(
    x = [training_img,                    # data train
         train_padded_txt,
         train_input_length,
         train_label_length],
    y = np.zeros(len(training_img)),      # label

    batch_size = config['BATCH_SIZE'],    # batch size
    epochs = config['EPOCHS'],            # epochs

    validation_data = ([valid_img,        # data val
                        valid_padded_txt,
                        valid_input_length,
                        valid_label_length],
                        [np.zeros(len(valid_img))]),
    verbose = 1,
    callbacks = callbacks_list)

Epoch 1/100
Epoch 1: val_loss improved from inf to 167.02979, saving model to checkpoint_weights.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 167.02979 to 163.56577, saving model to checkpoint_weights.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 163.56577 to 161.11632, saving model to checkpoint_weights.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 161.11632 to 141.58365, saving model to checkpoint_weights.hdf5
Epoch 5/100


KeyboardInterrupt: ignored

time: 3min 25s (started: 2023-10-15 11:15:36 +00:00)
