## За аналог взят

[Распознавание рукописного ввода с использованием CRNN в Керасе](https://www.kaggle.com/samfc10/handwriting-recognition-using-crnn-in-keras)

In [1]:
import os
import cv2
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, Reshape, Bidirectional, LSTM, Dense, Lambda, Activation, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
import sys
sys.path.append('..')

from scr import train
data_path_train = '../data/dataset/dataset/training_data'
data_path_test = '../data/dataset/dataset/testing_data'
images_path_train = '../data/dataset/dataset/training_data/images'
images_path_test = '../data/dataset/dataset/testing_data/images'

In [5]:
def open_anatac(path):
    df = train.Train().get_annotation(path, True)
    df = df.drop_duplicates('fil_name')
    df = df.sort_values('fil_name', ignore_index = True)
    return df

In [19]:
train_df = open_anatac(data_path_train)
train_size = train_df.shape[0]
print(train_size)
train_df.head()

125


  return (df.label == 'other')&(df.text.str.contains('(^\d{5,10})|(\d{5,10}$)'))


Unnamed: 0,box,text,label,words,linking,id,fil_name
0,"[525, 904, 641, 926]",597005708,other,"[{'box': [525, 904, 641, 926], 'text': '597005...",[],8,971160
1,"[533, 829, 653, 853]",620419245,other,"[{'box': [533, 829, 653, 853], 'text': '620419...",[],29,989556
2,"[501, 839, 623, 871]",620429480,other,"[{'box': [501, 839, 623, 871], 'text': '620429...",[],10,990274
3,"[550, 883, 675, 904]",620915734,other,"[{'box': [550, 883, 675, 904], 'text': '620915...",[],119,999294
4,"[697, 796, 721, 927]",621800455,other,"[{'box': [697, 796, 721, 927], 'text': '621800...",[],3,1118259


In [20]:
test_df = open_anatac(data_path_test)
test_size = test_df.shape[0]
print(test_size)
test_df.head()

49


  return (df.label == 'other')&(df.text.str.contains('(^\d{5,10})|(\d{5,10}$)'))


Unnamed: 0,box,text,label,words,linking,id,fil_name
0,"[633, 775, 653, 874]",82092117,other,"[{'box': [633, 775, 653, 874], 'text': '820921...",[],6,82092117
1,"[675, 774, 696, 879]",82200067,other,"[{'box': [675, 774, 696, 879], 'text': '822000...",[],22,82200067_0069
2,"[680, 814, 698, 917]",82250337,other,"[{'box': [680, 814, 698, 917], 'text': '822503...",[],11,82250337_0338
3,"[680, 832, 702, 941]",82251504,other,"[{'box': [680, 832, 702, 941], 'text': '822515...",[],10,82251504
4,"[695, 787, 719, 895]",82252956,other,"[{'box': [695, 787, 719, 895], 'text': '822529...",[],12,82252956_2958


## Уникальные символы которые мы хотим распознать

In [10]:
sym_list = []

for text in  train_df.text:
    for word in text:
        word = word.upper()
        if word not in sym_list: 
            sym_list.append(word)
sym_list.sort()
alphabets = ''.join(sym_list)
print(sym_list)
alphabets

[' ', '&', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'W', 'X', 'Y']


' &,.0123456789ABCDEFGIJKMNOPRSTUWXY'

## Максимальная длинна строки

In [29]:
max_str_len = train_df.text.str.len().max()
max_str_len

38

## Предварительная обработка и подготовка изображений к обучению

* Изображения загружаются в оттенках серого и меняют форму на ширину 256 и высоту 64.
* Ширина и высота обрезаются, если они больше 256 и 64 соответственно.  
* Если они меньше, то изображение дополняется белыми пикселями.  
* Наконец, изображение поворачивается по часовой стрелке, чтобы привести форму изображения к (x, y).
* Затем изображение нормализуется до диапазона [0, 1].

In [62]:
def preprocess(img):
#     h_max, w_max = 1000, 754
    h_max, w_max = 256, 64
    
    (h, w) = img.shape
    
    final_img = np.ones([h_max, w_max])*255 # blank white image
    
    # crop
    if w > w_max:
        img = img[:, :w_max]
        
    if h > h_max:
        img = img[:h_max, :]
    
    
    final_img[:h, :w] = img
    return cv2.rotate(final_img, cv2.ROTATE_90_CLOCKWISE)

In [63]:
def lod_imag(df, images_path):
    data = []
    targets = []
    filenames = []

    for name in df.fil_name.unique():
        filename = name + '.png'
        image_path = f'{images_path}/{filename}'
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        image = preprocess(image)
        image = image/255

        data.append(image)
        targets.append(train_df.loc[train_df.fil_name == name, 'text'].max())
        filenames = [filename]
    return data, targets, filenames

In [64]:
train_x, train_targets, train_filenames = lod_imag(train_df, images_path_train)
test_x, test_targets, test_filenames = lod_imag(test_df, images_path_test)

In [65]:
train_x = np.array(train_x).reshape(-1, 256, 64, 1)
test_x = np.array(test_x).reshape(-1, 256, 64, 1)

## Подготовка этикеток для CTC Loss¶

 Узнайте больше о потере CTC и о том, почему это потрясающе для распознавания текста [здесь](https://theailearner.com/2019/05/29/connectionist-temporal-classificationctc/).

 Метки должны быть преобразованы в числа, которые представляют каждый символ в обучающем наборе.  «Алфавиты» состоят из AZ и трех специальных символов (- и пробела).

In [66]:
alphabets = alphabets #Уникальные символы которые мы хотим распознать
max_str_len = max_str_len # Максимальная длинна строки
num_of_characters = len(alphabets) + 1 # +1 для псевдо-пустого ctc
num_of_timestamps = 64 # максимальная длина прогнозируемых меток

def label_to_num(label):
    label_num = []
    for ch in label:
        label_num.append(alphabets.find(ch))
        
    return np.array(label_num)

def num_to_label(num):
    ret = ""
    for ch in num:
        if ch == -1:  # CTC Blank
            break
        else:
            ret+=alphabets[ch]
    return ret

In [67]:
text = '93329540'
print(text, '\n',label_to_num(text))

93329540 
 [13  7  7  6 13  9  8  4]


* train_y содержит истинные метки, преобразованные в числа и дополненные -1.  Длина каждой метки равна max_str_len.
* train_label_len содержит длину каждой истинной метки (без заполнения)
* train_input_len содержит длину каждой прогнозируемой метки.  Длина всех прогнозируемых меток постоянна, т.е. количество временных меток - 2.
* train_output фиктивный вывод для потери ctc.

In [68]:
def get_labls(df, size, max_str_len):
    y = np.ones([size, max_str_len]) * -1
    label_len = np.zeros([size, 1])
    input_len = np.ones([size, 1]) * (num_of_timestamps-2)
    output = np.zeros([size])

    for i in range(size):
        label_len[i] = len(df.loc[i, 'text'])
        y[i, 0:len(df.loc[i, 'text'])] = label_to_num(df.loc[i, 'text'])
    return y, label_len, input_len, output

In [69]:
train_y, train_label_len, train_input_len, train_output = get_labls(train_df, train_size, max_str_len = max_str_len)
test_y, test_label_len, test_input_len, test_output = get_labls(test_df, test_size, max_str_len = max_str_len)

In [70]:
input_data = Input(shape=(256, 64, 1), name='input')

inner = Conv2D(32, (3, 3), padding='same', name='conv1', kernel_initializer='he_normal')(input_data)  
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = MaxPooling2D(pool_size=(2, 2), name='max1')(inner)

inner = Conv2D(64, (3, 3), padding='same', name='conv2', kernel_initializer='he_normal')(inner)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = MaxPooling2D(pool_size=(2, 2), name='max2')(inner)
inner = Dropout(0.3)(inner)

inner = Conv2D(128, (3, 3), padding='same', name='conv3', kernel_initializer='he_normal')(inner)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = MaxPooling2D(pool_size=(1, 2), name='max3')(inner)
inner = Dropout(0.3)(inner)

# CNN to RNN
inner = Reshape(target_shape=((64, 1024)), name='reshape')(inner)
inner = Dense(64, activation='relu', kernel_initializer='he_normal', name='dense1')(inner)

## RNN
inner = Bidirectional(LSTM(256, return_sequences=True), name = 'lstm1')(inner)
inner = Bidirectional(LSTM(256, return_sequences=True), name = 'lstm2')(inner)

## OUTPUT
inner = Dense(num_of_characters, kernel_initializer='he_normal',name='dense2')(inner)
y_pred = Activation('softmax', name='softmax')(inner)

model = Model(inputs=input_data, outputs=y_pred)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 256, 64, 1)]      0         
_________________________________________________________________
conv1 (Conv2D)               (None, 256, 64, 32)       320       
_________________________________________________________________
batch_normalization_12 (Batc (None, 256, 64, 32)       128       
_________________________________________________________________
activation_12 (Activation)   (None, 256, 64, 32)       0         
_________________________________________________________________
max1 (MaxPooling2D)          (None, 128, 32, 32)       0         
_________________________________________________________________
conv2 (Conv2D)               (None, 128, 32, 64)       18496     
_________________________________________________________________
batch_normalization_13 (Batc (None, 128, 32, 64)       256 

In [71]:
# the ctc loss function
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage
    y_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

In [72]:
labels = Input(name='gtruth_labels', shape=[max_str_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

ctc_loss = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
model_final = Model(inputs=[input_data, labels, input_length, label_length], outputs=ctc_loss)

In [73]:
# the loss calculation occurs elsewhere, so we use a dummy lambda function for the loss
model_final.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=Adam(lr = 0.0001))

model_final.fit(x=[train_x, train_y, train_input_len, train_label_len], y=train_output, 
                validation_data=([test_x, test_y, test_input_len, test_label_len], test_output),
                epochs=5, batch_size=128)

Epoch 1/5


InvalidArgumentError:  All labels must be nonnegative integers, batch: 69 labels: 25,-1,-1,0,34,-1,-1,-1,2,0,25,-1,-1,0,34,-1,-1,-1,0,5,4,4,5,11
	 [[node model_4/ctc/CTCLoss (defined at C:\Users\uriks\AppData\Local\Temp/ipykernel_2660/496030596.py:7) ]] [Op:__inference_train_function_17276]

Function call stack:
train_function


In [61]:
# for list_ in [train_x, train_y, train_input_len, train_label_len]:
for list_ in [test_x, test_y, test_input_len, test_label_len]:
    print(len(list_))

49
49
49
49
