# Глава 1. Предварительная обработка и аннотирование

In [217]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, LSTM, TimeDistributed, Embedding, RepeatVector
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
from lxml import etree
from sklearn.model_selection import train_test_split
import pickle
from datetime import datetime
import xml.etree.ElementTree as ET
from pylatexenc.latexwalker import LatexWalker, LatexMacroNode, LatexCharsNode, LatexEnvironmentNode
from keras.preprocessing.sequence import pad_sequences

In [218]:
# Target size, in pixels, that every image is resized to (square images)
desired_width = desired_height = 256

In [219]:
# Load every PNG image in a folder and preprocess it
def load_and_preprocess_images(folder, width, height):
    """Read all ``.png`` files in *folder* as grayscale and preprocess them.

    Each readable image is smoothed with a 5x5 Gaussian blur, binarised with
    a Gaussian adaptive threshold (block size 11, constant 2) and finally
    resized with ``cv2.resize(img, (width, height))``.

    Returns:
        dict mapping the file name without its extension to the processed
        image. Files that ``cv2.imread`` cannot read are reported with
        ``print`` and left out of the result.
    """
    images = {}
    for filename in os.listdir(folder):
        # Only PNG files are considered (case-sensitive suffix match)
        if not filename.endswith('.png'):
            continue

        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Failed to read image: {filename}")
            continue

        # Smooth -> adaptive threshold -> resize
        smoothed = cv2.GaussianBlur(img, (5, 5), 0)
        binary = cv2.adaptiveThreshold(
            smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )
        stem, _ = os.path.splitext(filename)
        images[stem] = cv2.resize(binary, (width, height))
    return images

In [220]:
# Parse an InkML file and extract its LaTeX ground-truth annotations
def parse_inkml(file_path):
    """Return the LaTeX ground-truth strings stored in an InkML file.

    The file is parsed with lxml in ``recover=True`` mode. Every
    ``<annotation type="truth">`` element in the InkML namespace contributes
    its text; elements with missing or whitespace-only text are skipped so
    that callers never receive ``None`` in place of a formula.

    Args:
        file_path: path of the ``.inkml`` file.

    Returns:
        A list of annotation strings (possibly empty, in which case a message
        is printed), or ``None`` when the file could not be parsed.
    """
    try:
        tree = etree.parse(file_path, etree.XMLParser(recover=True))
        root = tree.getroot()
        if root is None:
            # Nothing usable was parsed from the file
            print(f"Error parsing {file_path}: document has no root element")
            return None
        namespace = {'ink': 'http://www.w3.org/2003/InkML'}

        # Collect all non-empty LaTeX ground-truth annotations
        latex_expressions = [
            annotation.text
            for annotation in root.findall('.//ink:annotation', namespaces=namespace)
            if annotation.attrib.get('type') == 'truth'
            and annotation.text is not None
            and annotation.text.strip()
        ]
        if not latex_expressions:
            print(f"No LaTeX annotations found in {file_path}")
        return latex_expressions
    except etree.XMLSyntaxError as e:
        print(f"Error parsing {file_path}: {e}")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip the file
        print(f"Unexpected error parsing {file_path}: {e}")
        return None

In [221]:
# Load the annotations of every InkML file in a folder
def load_annotations(folder):
    """Map each ``.inkml`` file in *folder* to its first LaTeX annotation.

    Files for which ``parse_inkml`` returns ``None`` or an empty list are
    skipped.

    Returns:
        dict mapping the file name without its extension to the first
        annotation returned by ``parse_inkml``.
    """
    annotations = {}
    inkml_names = [name for name in os.listdir(folder) if name.endswith('.inkml')]
    for name in inkml_names:
        expressions = parse_inkml(os.path.join(folder, name))
        if not expressions:
            continue
        # Only the first annotation is kept (one LaTeX annotation per file is assumed)
        annotations[os.path.splitext(name)[0]] = expressions[0]
    return annotations

In [222]:
# Paths to the image and annotation folders
images_folder_train = './data_2019/1_images_train'
annotations_folder_train = './data_2019/2_annotation_train'
images_folder_test = './data_2019/3_images_test'
annotations_folder_test = './data_2019/4_annotation_test'

# Load the images and annotations of the training set
images_train = load_and_preprocess_images(images_folder_train, desired_width, desired_height)
print(f"Loaded {len(images_train)} training images")

annotations_train = load_annotations(annotations_folder_train)
print(f"Loaded {len(annotations_train)} training annotations")

# Load the images and annotations of the test set
images_test = load_and_preprocess_images(images_folder_test, desired_width, desired_height)
print(f"Loaded {len(images_test)} testing images")

annotations_test = load_annotations(annotations_folder_test)
print(f"Loaded {len(annotations_test)} testing annotations")

Loaded 10979 training images
Loaded 10979 training annotations
Loaded 1199 testing images
Loaded 1199 testing annotations


In [223]:
# Pair every training image with its annotation (images without one are dropped)
data_train = [
    (image, annotations_train[key])
    for key, image in images_train.items()
    if key in annotations_train
]

# Pair every test image with its annotation (images without one are dropped)
data_test = [
    (image, annotations_test[key])
    for key, image in images_test.items()
    if key in annotations_test
]

In [None]:
# Print the image shape and annotation of the first 5 samples of each split
for heading, samples in (("Training Data:", data_train), ("\nTesting Data:", data_test)):
    print(heading)
    for image, annotation in samples[:5]:
        print(f"Image Shape: {image.shape}, Annotation: {annotation}")

In [None]:
# Show the first 5 image/annotation pairs of each split for a visual check
for heading, samples in (("Training Data:", data_train), ("Testing Data:", data_test)):
    print(heading)
    # Slicing never goes out of range, even with fewer than 5 samples
    for img, annotation in samples[:5]:
        plt.imshow(img, cmap='gray')
        plt.title(annotation)
        plt.show()

In [226]:
# Shape of a single preprocessed image, taken from the first training sample
img_shape = data_train[0][0].shape

# Training set: images stacked into one numpy array, annotations kept as a list
X_train = np.stack([image for image, _ in data_train])
y_train = [annotation for _, annotation in data_train]

# Test set: images stacked into one numpy array, annotations kept as a list
X_test = np.stack([image for image, _ in data_test])
y_test = [annotation for _, annotation in data_test]

In [227]:
# Scale pixel values to the [0, 1] range
PIXEL_MAX = 255.0
X_train = X_train.astype(np.float32) / PIXEL_MAX
X_test = X_test.astype(np.float32) / PIXEL_MAX

In [None]:
# Sanity-check the array sizes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
# y_train / y_test are plain Python lists, so these two lines print their lengths
print("y_train shape:", len(y_train))
print("y_test shape:", len(y_test))

In [229]:
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from pylatexenc.latexwalker import LatexWalker, LatexCharsNode, LatexMacroNode, LatexEnvironmentNode
import pickle

In [230]:
# The annotated LaTeX formulas of the training set (same list object as y_train)
latex_examples = y_train

In [231]:
# Base vocabulary of LaTeX symbols and operators that the tokenizer is fitted on.
# The list is kept exactly as written, including repeated entries.
# NOTE(review): multi-character entries that are not a backslash followed by letters
# (e.g. '\\text{', '\\mathbb{Z}', '\\{', '}\\') do not look producible by the regex in
# custom_latex_tokenizer, which emits either a \command or a single character — confirm
# whether they are needed.
# NOTE(review): presumably the repeated entries influence the index order assigned by
# the Keras Tokenizer (it ranks words by frequency) — verify before deduplicating.
latex_tokens = [
    '\\frac', '\\right', '\\left', '\\sqrt', '\\mbox', '_', '\\int', '\\sin', ')', 
    '\\sum', '^', '(', '\\log', '\\lim', '\\alpha', '\\times', '\\pi', '\\beta', 
    '\\cos', '[', ']', '+', '-', '=', '\\leq', '\\geq', '\\cdot', '\\neq', 
    '\\infty', '\\partial', '\\forall', '\\exists', '\\neg', '\\lor', '\\land', 
    '\\rightarrow', '\\leftarrow', '\\uparrow', '\\downarrow', '\\circ', '\\bullet', 
    '\\perp', '\\parallel', '\\oplus', '\\otimes', '\\approx', '\\sim', '\\cong', 
    '\\equiv', '\\propto', '\\subset', '\\supset', '\\subseteq', '\\supseteq', 
    '\\cup', '\\cap', '\\setminus', '\\vdash', '\\dashv', '\\models', '\\mid', 
    '\\parallel', '\\perp', '\\simeq', '\\asymp', '\\doteq', '\\bowtie', '\\lt', 
    '\\gt', '\\prime', '\\star', '\\dagger', '\\ddagger', '\\angle', '\\measuredangle', 
    '\\triangle', '\\square', '\\diamond', '\\lozenge', '\\blacktriangle', 
    '\\blacktriangledown', '\\blacksquare', '\\blacklozenge', '\\in', '\\notin', 
    '\\ni', '\\owns', '\\to', '\\mapsto', '\\longrightarrow', '\\longmapsto', 
    '\\leftarrow', '\\hookrightarrow', '\\leftrightarrow', '\\iff', '\\implies', 
    '\\subsetneq', '\\supsetneq', '\\vdots', '\\ddots', '\\aleph', '\\beth', 
    '\\gimel', '\\daleth', '\\hbar', '\\imath', '\\jmath', '\\ell', '\\wp', 
    '\\Re', '\\Im', '\\top', '\\bot', '\\emptyset', '\\nabla', '\\surd', 
    '\\triangleleft', '\\triangleright', '\\Box', '\\Diamond', '\\vee', '\\wedge', 
    '\\lfloor', '\\rfloor', '\\lceil', '\\rceil', '\\langle', '\\rangle', 
    '\\Vert', '\\vert', '\\}', '\\{', '\\$', '\\%', '\\&', '\\_', '\\#', 
    '\\text{', '\\mathbf{', '\\mathbb{', '\\mathcal{', '\\mathfrak{', '\\mathsf{', 
    '\\mathtt{', '\\textit{', '\\textrm{', '\\textbf{', '\\textsf{', '\\texttt{', '\\lesssim', '\\gtrsim',
    '\\lessapprox', '\\gtrapprox', '\\mathbb{Z}', '\\mathbb{Q}', '\\mathbb{R}', '\\mathbb{C}', '\\overline', 
    '\\blacksquare', '\\blacksquare', '\\mathbb{S}', '\\vert', '\\Vert', ',' , '$', 
    '<', '>', '=', '\\leq', '\\geq', '\\ll', '\\gg', '\\prec', '\\succ', '\\sim', '\\approx', 
    '\\cong', '\\simeq', '\\asymp', '\\doteq', '\\equiv', '\\subset', '\\supset', '\\subseteq', 
    '\\supseteq', '\\sqsubset', '\\sqsupset', '\\sqsubseteq', '\\sqsupseteq', '\\in', 
    '\\ni', '\\notin', '\\propto', '\\parallel', '\\perp', '{', '}', '\\{', '}\\' ,
    '\\Bigg','\\ldots', '.' , '/', '\\mathrm' , '|', '\\tan', '\\pm', '\\Big', '\\cdots', '!' , 
    '\\limits', '\\div', '\'', '\\', '\\gt', ';', '\\lt', '\\rbrack', '\\lbrack', '\\dots', '\\Pi', '\\hbox', '\\vtop'
]

In [232]:
# Digits, Latin letters and Greek letters to add to the vocabulary
digits = [*'0123456789']
latin_lowercase = [*'abcdefghijklmnopqrstuvwxyz']
latin_uppercase = [*'ABCDEFGHIJKLMNOPQRSTUVWXYZ']

# Names of the lowercase Greek commands
greek_letters = ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta',
                 'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi',
                 'pi', 'rho', 'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega']
# Names whose capitalised form is also added as a command
greek_letters2 = ['gamma', 'delta', 'theta', 'lambda', 'xi', 'phi', 'psi', 'omega']

# Prefix each name with a backslash to obtain the LaTeX command
greek_lowercase = ['\\' + name for name in greek_letters]
greek_uppercase = ['\\' + name.capitalize() for name in greek_letters2]

In [233]:
# Append digits, Latin letters and Greek commands to the vocabulary, in this order
for extra_tokens in (digits, latin_lowercase, latin_uppercase, greek_lowercase, greek_uppercase):
    latex_tokens.extend(extra_tokens)

In [None]:
# Show the complete token vocabulary for inspection
print(latex_tokens)

In [235]:
# Create the tokenizer and fit it on the token vocabulary.
# lower=False and filters='' are passed so that case is preserved and no characters
# are filtered out of the tokens.
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(latex_tokens)


# Split a LaTeX formula into tokens
def custom_latex_tokenizer(formula):
    """Return the list of tokens found in *formula*, in order.

    A token is, in order of preference: a backslash followed by one or more
    ASCII letters (a LaTeX command), a single ASCII letter or digit, or any
    other single non-whitespace character. Whitespace is discarded.
    """
    pattern = re.compile(r'\\[a-zA-Z]+|[a-zA-Z0-9]|[^\s]')
    return [match.group() for match in pattern.finditer(formula)]

In [236]:
# Tokenise every example formula
tokenized_examples = list(map(custom_latex_tokenizer, latex_examples))

# Convert the token lists to integer sequences
encoded_examples = tokenizer.texts_to_sequences(tokenized_examples)

# Pad at the end so that every sequence has the length of the longest one
max_length = max(map(len, encoded_examples))
padded_examples = pad_sequences(encoded_examples, maxlen=max_length, padding='post')

# Invert the vocabulary and map the padded integer sequences back to tokens;
# ids that have no vocabulary entry decode to an empty string
reverse_word_index = dict((index, word) for word, index in tokenizer.word_index.items())
decoded_examples = [
    [reverse_word_index.get(index, '') for index in sequence]
    for sequence in padded_examples
]

In [237]:
# Save the fitted tokenizer to disk
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

In [238]:
# Write the token vocabulary to a text file, one "token: index" pair per line
with open('latex_token.txt', 'w') as f:
    f.writelines(f"{word}: {index}\n" for word, index in tokenizer.word_index.items())

In [None]:
# Print each formula next to its token list and its integer encoding
print("Tokenized and Encoded Sequences:")
for original, tokenized, encoded in zip(latex_examples, tokenized_examples, encoded_examples):
    print(f"Original: {original}")
    print(f"Tokenized: {tokenized}")
    print(f"Encoded: {encoded}")

In [None]:
# Print the decoded sequences and write them to a file, one formula per line.
# The file is written as UTF-8 so that it matches the encoding used to read it back.
with open('check_latex_decoded.txt', 'w', encoding='utf-8') as f:
    print("\nDecoded Sequences:")
    for decoded in decoded_examples:
        print(' '.join(decoded))
        f.write(''.join(decoded)+'\n')

# Read the file back into a list with exactly one entry per written line
# (the line terminator is stripped, so there is no spurious trailing empty entry)
with open("check_latex_decoded.txt", "r", encoding="utf-8") as file:
    data_array = [line.rstrip('\n') for line in file]

# Print the result
for i, line in enumerate(data_array, start=1):
    print(f"Line {i}: {line}")

In [247]:
# Comparison of the parsed and the decoded formulas with whitespace removed
# Helper that removes whitespace from a LaTeX formula
def remove_spaces(formula):
    """Return *formula* with every run of whitespace removed."""
    pieces = re.split(r'\s+', formula)
    return ''.join(pieces)

# The decoded lines in data_array follow the order in which data_train was built:
# the keys of images_train that also occur in annotations_train. Rebuild that key
# order here so every decoded line is compared with the annotation it came from.
paired_keys = [key for key in images_train if key in annotations_train]

with open("check_formula.txt", "w", encoding="utf-8") as result_file:
    # zip leaves data_array untouched, so the cell can be re-run safely
    for filename, decoded_formula in zip(paired_keys, data_array):
        original_formula_no_spaces = remove_spaces(annotations_train[filename])
        decoded_formula_no_spaces = remove_spaces(decoded_formula)

        if original_formula_no_spaces != decoded_formula_no_spaces:
            print(f"Mismatch in file {filename}.inkml")
            print(f"Original: {original_formula_no_spaces}")
            print(f"Decoded: {decoded_formula_no_spaces}")
            result_file.write(filename+'.\n'+original_formula_no_spaces+'.\n'+decoded_formula_no_spaces+'.\n')
    print("Comparison complete.")

Comparison complete.


In [248]:
# Build and compile the CNN + LSTM model
def create_model(input_shape, num_classes, sequence_length=None):
    """Create and compile the image-to-token-sequence model.

    The network consists of three Conv2D/BatchNormalization/MaxPooling2D/
    Dropout stages (32, 64 and 128 filters), a 256-unit dense layer, a
    RepeatVector that repeats the image encoding once per output step, an
    LSTM returning the full sequence and a time-distributed softmax layer.

    Args:
        input_shape: shape of one input image, passed to ``Input``.
        num_classes: size of the softmax output at every step.
        sequence_length: number of output steps. When ``None`` (default) the
            module-level ``max_length`` is used, which is what the function
            did before this parameter existed.

    Returns:
        The model, compiled with Adam (learning rate 0.001),
        ``categorical_crossentropy`` loss and the ``accuracy`` metric.
    """
    if sequence_length is None:
        sequence_length = max_length

    inputs = Input(shape=input_shape)

    # CNN part: three identical stages that differ only in the filter count
    x = inputs
    for filters in (32, 64, 128):
        x = Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2, 2))(x)
        x = Dropout(0.25)(x)

    x = Flatten()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)

    # RNN part: repeat the encoding for every output step and decode with an LSTM
    x = RepeatVector(sequence_length)(x)
    x = LSTM(128, return_sequences=True)(x)
    x = TimeDistributed(Dense(num_classes, activation='softmax'))(x)

    model = Model(inputs, x)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Create and compile the model.
# One output class per vocabulary entry, plus index 0, which the padded sequences
# use as the fill value.
num_classes = len(tokenizer.word_index) + 1
# cv2.resize(img, (width, height)) yields arrays of shape (height, width), so the
# height comes first in the input shape.
model = create_model((desired_height, desired_width, 1), num_classes)

In [14]:
# Define the training callbacks: keep the model with the lowest validation loss and
# stop once the validation loss has not improved for `patience` epochs.
# NOTE(review): the training cell in this file runs epochs=5, which is below
# patience=10, so early stopping presumably never triggers — confirm the intended values.
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min')


In [None]:
# Train the model.
# NOTE(review): y_train_encoded and y_test_encoded are not defined anywhere in the
# visible source; presumably they are one-hot encoded, padded token sequences matching
# the model's categorical_crossentropy loss — confirm where they are built.
# NOTE(review): the test split is used as validation data, so the checkpoint and early
# stopping callbacks monitor the test set — confirm this is intended.
history = model.fit(
    X_train, y_train_encoded,
    validation_data=(X_test, y_test_encoded),
    epochs=5,
    batch_size=32,
    callbacks=[checkpoint, early_stopping]
)