## Character Recognition - CROHME 2023 dataset

To convert handwritten math equations in an image as latex code

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

import os
import cv2
from transformers import TFViTModel

2024-12-09 19:48:55.815185: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-09 19:48:55.821368: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-09 19:48:55.833569: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-09 19:48:55.852812: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-09 19:48:55.857570: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-09 19:48:55.875923: I tensorflow/core/platform/cpu_feature_gu

In [2]:
# hyperparameters
IMG_SIZE = 224
BATCH_SIZE = 32
LEARNING_RATE = 0.001
MAX_SEQ_LEN = 50
AMOUNT_CHANNELS = 3

In [3]:
import xml.etree.ElementTree as ET
missing_attrib_counter = 0
def parse_inkml(inkml_path):
    """
    Parse an .inkml file to extract the corresponding LaTeX label and image name.
    """
    global missing_attrib_counter
    tree = ET.parse(inkml_path)
    root = tree.getroot()

    image_name = None
    latex_code = None
    
    try:
        image_name = os.path.splitext(os.path.basename(inkml_path))[0]
        latex_code = root.find(".//{http://www.w3.org/2003/InkML}annotation[@type='truth']").text
    except AttributeError:
        missing_attrib_counter += 1
        print(missing_attrib_counter)

    return image_name, latex_code

    
def extract_labels_from_inkml(inkml_dir):
    """
    Extract labels for all .inkml files in a directory.
    """
    latex_label_map = {}
    for file in os.listdir(inkml_dir):
        if file.endswith(".inkml"):
            inkml_path = os.path.join(inkml_dir, file)
            image_name, latex_code = parse_inkml(inkml_path)
            if image_name is None or latex_code is None:
                continue
            
            latex_label_map[image_name] = latex_code
            
    return latex_label_map

In [4]:
def match_images_with_labels(img_dir, latex_label_map):
    """
    Match .png images in img_dir with their LaTeX labels using the UI key.
    """
    matched_data = []
    for img_file in os.listdir(img_dir):
        if img_file.endswith(".png"):
            image_name = os.path.splitext(img_file)[0]  # Strip .png extension
            if image_name in latex_label_map:
                matched_data.append((os.path.join(img_dir, img_file), latex_label_map[image_name]))

    return matched_data

In [5]:
# Function to load and preprocess images
def load_image(img_path):
    img = tf.io.read_file(img_path)  # Read the image file
    img = tf.image.decode_image(img, channels=3)  # Decode the image (grayscale)
    img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])  # Resize the image
    img = img / 255.0  # Normalize the image
    return img

# Function to tokenize and pad labels
def preprocess_label(label, tokenizer, max_len):
    # Tokenize the label
    tokens = tokenizer.encode(label)
    # Pad the label
    padded_tokens = tf.keras.preprocessing.sequence.pad_sequences(
        [tokens], padding='post', maxlen=max_len
    )
    return padded_tokens[0]  # Return the first (and only) batch

# Convert your data into a TensorFlow dataset
def create_tf_dataset(matched_data, tokenizer, batch_size=32, max_len=MAX_SEQ_LEN, img_size=224):
    image_paths, labels = zip(*matched_data)

    images = [load_image(image_path) for image_path in image_paths]
    tokenized_labels = [preprocess_label(label, tokenizer, max_len) for label in labels]
    
    # Create the TensorFlow Dataset
    dataset = tf.data.Dataset.from_tensor_slices((images, tokenized_labels))

    dataset = dataset.batch(batch_size)
    dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)  # Prefetch for better performance

    return dataset

In [6]:
inkml_train_dir = "CROHME23/TC11_CROHME23/INKML/train/CROHME2023_train"
train_labels = extract_labels_from_inkml(inkml_train_dir)

img_train_dir = "CROHME23/TC11_CROHME23/IMG/train/CROHME2013_train" # typo in dir name: is 2023 data as well
matched_train_data = match_images_with_labels(img_train_dir, train_labels)


inkml_val_dir = "CROHME23/TC11_CROHME23/INKML/val/CROHME2023_val"
val_labels = extract_labels_from_inkml(inkml_val_dir)

img_val_dir = "CROHME23/TC11_CROHME23/IMG/val/CROHME2023_val"
matched_val_data = match_images_with_labels(img_val_dir, val_labels)


inkml_test_dir = "CROHME23/TC11_CROHME23/INKML/test/CROHME2023_test"
test_labels = extract_labels_from_inkml(inkml_test_dir)

img_test_dir = "CROHME23/TC11_CROHME23/IMG/test/CROHME2023_test"
matched_test_data = match_images_with_labels(img_test_dir, test_labels)

In [7]:
from transformers import AutoTokenizer

# TODO: check if specialized LaTeX tokenizer exists
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_dataset = create_tf_dataset(matched_train_data, tokenizer, batch_size=BATCH_SIZE, max_len=MAX_SEQ_LEN)
val_dataset = create_tf_dataset(matched_val_data, tokenizer, batch_size=BATCH_SIZE, max_len=MAX_SEQ_LEN)
test_dataset = create_tf_dataset(matched_test_data, tokenizer, batch_size=BATCH_SIZE, max_len=MAX_SEQ_LEN)

I0000 00:00:1733770141.747015  140244 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-09 19:49:01.747357: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-12-09 19:49:44.674853: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1384857600 exceeds 10% of free system memory.


In [8]:
latex = r"\frac{1}{2}"
print(tokenizer.encode(latex))
print(tokenizer.decode(tokenizer.encode(latex)))

[101, 1032, 25312, 2278, 1063, 1015, 1065, 1063, 1016, 1065, 102]
[CLS] \ frac { 1 } { 2 } [SEP]


In [9]:
# load pretrained Vision Transformer
vit = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
patch_size = 16
seq_len = (IMG_SIZE // patch_size)**2 + 1
hidden_size = 768
output_shape_model = (seq_len, hidden_size)

def preprocess(inputs):
    # Scale pixel values to the range [0, 1] and normalize using "torch" mode
    inputs = tf.keras.applications.imagenet_utils.preprocess_input(inputs, mode="torch")
    # Convert KerasTensor to TensorFlow Tensor explicitly
    inputs = tf.convert_to_tensor(inputs)
    return inputs

# TODO: test if grayscale gives better results than rgb
def get_vit_encoder():
    input_layer = tf.keras.layers.Input(shape=(IMG_SIZE, IMG_SIZE, AMOUNT_CHANNELS), name="image_input")

    # Preprocess inputs using the updated function
    # transpose because for some reason model flips the order, so tranposing here to compensate
    processed_inputs = tf.keras.layers.Lambda(lambda x: tf.transpose(x, perm=[0, 3, 1, 2]))(input_layer)
    print("here:", processed_inputs)
    
    # Ensure inputs are compatible with TFViTModel
    outputs = tf.keras.layers.Lambda(lambda x: vit(x).last_hidden_state, output_shape=output_shape_model)(processed_inputs)

    return tf.keras.Model(inputs=input_layer, outputs=outputs, name="vit_encoder")

All PyTorch model weights were used when initializing TFViTModel.

All the weights of TFViTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTModel for predictions without further training.


In [10]:
def get_decoder(vocab_size, hidden_size):
    decoder_inputs = layers.Input(shape=(None,))
    encoder_outputs = layers.Input(shape=(None, hidden_size))  # ViT feature size

    embedding = layers.Embedding(input_dim=vocab_size, output_dim=hidden_size)(decoder_inputs)
    gru_output = layers.GRU(hidden_size, return_sequences=True)(embedding, initial_state=None)
    outputs = layers.Dense(vocab_size, activation="softmax")(gru_output)

    return tf.keras.Model(inputs=[decoder_inputs, encoder_outputs], outputs=outputs, name="decoder")

In [11]:
def build_model(vocab_size):
    encoder = get_vit_encoder()
    decoder = get_decoder(vocab_size, hidden_size=512)

    image_inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, AMOUNT_CHANNELS))
    token_inputs = layers.Input(shape=(None,))

    encoder_outputs = encoder(image_inputs)
    encoder_outputs = layers.Dense(512)(encoder_outputs)
    decoder_outputs = decoder([token_inputs, encoder_outputs])

    return tf.keras.Model(inputs=[image_inputs, token_inputs], outputs=decoder_outputs)

In [12]:
model = build_model(vocab_size=len(tokenizer.vocab))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

here: <KerasTensor shape=(None, 3, 224, 224), dtype=float32, sparse=False, name=keras_tensor_2>


In [13]:
# Modify dataset to return both inputs as required by model
def modify_dataset_for_model(dataset):
    return dataset.map(lambda img, label: ((img, label), label))

train_dataset_model = modify_dataset_for_model(train_dataset)
val_dataset_model = modify_dataset_for_model(val_dataset)
test_dataset_model = modify_dataset_for_model(test_dataset)

In [14]:
train_model = False
models_path = "../models/CROHME"

if train_model:
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True
    )
    
    history = model.fit(
        train_dataset_model,
        validation_data=val_dataset_model,
        epochs=50,
        callbacks=[early_stopping]
    )

else:
    history = None
    latest_version = sorted(os.listdir(models_path), reverse=True)[0]
    latest_version_path = os.path.join(models_path, latest_version)
    model.load_weights(latest_version_path)

In [15]:
counter = 1

def is_existing_file(path, counter):
    return os.path.isfile(os.path.join(path, f"char-recog-model_crohme_{counter:03d}.weights.h5"))

while is_existing_file(models_path, counter):
    counter += 1
    
model.save_weights(os.path.join(models_path, f"char-recog-model_crohme_{counter:03d}.weights.h5"))

### Evaluate Model

In [16]:
import pandas as pd
!pip install matplotlib
import matplotlib.pyplot as plt
if history is not None:
    history_frame = pd.DataFrame(history.history)
    history_frame.loc[:, ['loss', 'val_loss']].plot()
    history_frame.loc[:, ['accuracy', 'val_accuracy']].plot()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


In [17]:
loss, accuracy = model.evaluate(test_dataset_model)
print(loss, accuracy)
print(test_dataset_model)

2024-12-09 19:49:49.087488: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1384857600 exceeds 10% of free system memory.


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 736ms/step - accuracy: 0.9952 - loss: 0.0682
0.07104132324457169 0.9952083826065063
<_MapDataset element_spec=((TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 50), dtype=tf.int32, name=None)), TensorSpec(shape=(None, 50), dtype=tf.int32, name=None))>


In [19]:
for image_label_tuple, label in test_dataset_model:
    predictions = model.predict(image_label_tuple)
    print(predictions.shape)
    predicted_tokens = np.argmax(predictions, axis=-1)
    print(tokenizer.decode(predicted_tokens.flatten()))
    break

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
(32, 50, 30522)
[CLS] a ^ 2 - b ^ 2 = 4 \ pi ( 2n + 1 ) [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] h _ { xx } = - h _ { yy } \ neq0 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] - b, t abc z abc b - t [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] f ( z ) = \ log | z _ 1 - z | ^ 2 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [20]:
def evaluate_autoregressive(model, dataset, start_token, end_token, max_length=MAX_SEQ_LEN):
    correct = 0
    total = 0

    # Encode the start and end tokens
    start_token_encoded = tokenizer.encode(start_token)[0]
    end_token_encoded = tokenizer.encode(end_token)[0]
    
    for (image_inputs, true_tokens) in dataset:
        batch_size = image_inputs.shape[0]
        
        # Initialize generated tokens as integers
        generated_tokens = tf.fill((batch_size, 1), start_token_encoded)  # Start token as an ID

        # Pass image inputs through the encoder
        encoder_outputs = model.get_layer("vit_encoder")(image_inputs)
        
        # Apply the extra Dense layer to match the decoder input shape
        encoder_outputs = model.get_layer("dense_1")(encoder_outputs)  # Apply Dense layer
        
        # Autoregressive decoding loop
        for _ in range(max_length):
            # Predict the next token
            decoder_outputs = model.get_layer("decoder")([generated_tokens, encoder_outputs])
            next_tokens = tf.argmax(decoder_outputs[:, -1, :], axis=-1, output_type=tf.int32)
            print(next_tokens)
            
            generated_tokens = tf.concat([generated_tokens, tf.expand_dims(next_tokens, axis=-1)], axis=-1)

            # Stop if all sequences have reached the end token
            if tf.reduce_all(next_tokens == end_token_encoded):
                break

        # Compare generated tokens with true tokens (excluding padding and special tokens)
        for gen, true in zip(generated_tokens.numpy(), true_tokens.numpy()):
            # Strip padding and end tokens for comparison
            gen = gen[:np.where(gen == end_token_encoded)[0][0]] if end_token_encoded in gen else gen
            true = true[:np.where(true == end_token_encoded)[0][0]] if end_token_encoded in true else true
            if np.array_equal(gen, true):
                correct += 1
            total += 1

    accuracy = correct / total
    print(f"Autoregressive accuracy: {accuracy * 100:.2f}%")
    return accuracy

In [21]:
START_TOKEN = tokenizer.cls_token # [CLS]
END_TOKEN = tokenizer.sep_token # [SEP]

# Evaluate
evaluate_autoregressive(model, test_dataset, START_TOKEN, END_TOKEN)

2024-12-09 19:51:08.621598: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1384857600 exceeds 10% of free system memory.


tf.Tensor(
[101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101
 101 101 101 101 101 101 101 101 101 101 101 101 101 101], shape=(32,), dtype=int32)
tf.Tensor(
[101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101
 101 101 101 101 101 101 101 101 101 101 101 101 101 101], shape=(32,), dtype=int32)


KeyboardInterrupt: 

In [None]:
def predict_for_single_image(model, image_input, start_token, end_token, max_length=MAX_SEQ_LEN):
    # Encode the start and end tokens
    start_token_encoded = tokenizer.encode(start_token)[0]
    end_token_encoded = tokenizer.encode(end_token)[0]
    
    # Initialize generated tokens for a single image (shape: [1, 1] for a single token)
    generated_tokens = tf.fill((1, 1), start_token_encoded)  # Start token as an ID
    
    # Pass the single image input through the encoder
    encoder_outputs = model.get_layer("vit_encoder")(image_input)
    
    # Apply the extra Dense layer to match the decoder input shape
    encoder_outputs = model.get_layer("dense_1")(encoder_outputs)  # Apply Dense layer
    
    # Autoregressive decoding loop for a single image
    for _ in range(max_length):
        # Predict the next token
        decoder_outputs = model.get_layer("decoder")([generated_tokens, encoder_outputs])
        next_tokens = tf.argmax(decoder_outputs[:, -1, :], axis=-1, output_type=tf.int32)
        generated_tokens = tf.concat([generated_tokens, tf.expand_dims(next_tokens, axis=-1)], axis=-1)

        # Stop if the end token is predicted
        if tf.reduce_all(next_tokens == end_token_encoded):
            break

    print(generated_tokens)
    generated_text = tokenizer.decode(generated_tokens.numpy().flatten())
    
    return generated_text

In [None]:
image_test = load_image("CROHME23/TC11_CROHME23/IMG/test/CROHME2023_test/form_5_209_E1042.png")
image_test = tf.expand_dims(image_test, axis=0)
print(predict_for_single_image(model, image_test, START_TOKEN, END_TOKEN))