# CROHME 2023

## Imports

In [76]:
import tensorflow as tf
import tensorflow_datasets as tfds
import keras
from keras import layers
import matplotlib.pyplot as plt
import pickle

print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))


Num GPUs Available:  1


Import `crohme_dataset`

In [77]:
import datasets.crohme_dataset  # Register `crohme_dataset`

ds = tfds.load("crohme_dataset")  # `crohme_dataset` registered
test: tf.data.Dataset = ds["test"]
train: tf.data.Dataset = ds["train"]
validation: tf.data.Dataset = ds["validation"]

### Extra: Previewing InkML Files

I also created a little utility in C++ and GTK to render out an inkml file from the dataset. It reads the InkML file, and renders out the strokes as well as the LaTeX of what it's supposed to be. It was a fun project!

In [78]:
import os

random_data_point = next(iter(validation.shuffle(200_000).take(1)))
filepath = random_data_point["filepath"].numpy().decode("ascii")
os.system(f"inkmlviewer {filepath}")

/home/jeshwinprince/Programming/crohme/datasets/crohme_dataset/data/INKML/val/CROHME2016_test/UN_130_em_1061.inkml
Displaying app now...


0

## Preprocessing

### Text Vectorization

We will use `pylatexenc` to parse the LaTeX into nodes for custom splitting

In [79]:
from pylatexenc.latexwalker import (
    LatexWalker,
    LatexMacroNode,
    LatexEnvironmentNode,
    LatexCharsNode,
    LatexGroupNode,
)

START_TOKEN, END_TOKEN = "<START>", "<END>"


# Define the tokenization function using pylatexenc
def latex_tokenizer(latex_string):
    """
    Tokenizes a LaTeX string into tokens using pylatexenc.
    """
    if not latex_string:
        return []
    walker = LatexWalker(latex_string)

    def parse_node(nodelist):
        if len(nodelist) == 0:
            return []
        try:
            tokens = []
            for node in nodelist:
                if not node:
                    continue
                elif node.isNodeType(LatexMacroNode):
                    tokens.append(f"\\{node.macroname}")
                    # Parse arguments if they exist
                    tokens += parse_node(node.nodeargd.argnlist)
                elif node.isNodeType(LatexEnvironmentNode):
                    tokens.append(f"\\begin{{{node.environmentname}}}")
                    tokens += parse_node(node.nodeargd.argnlist)
                    tokens += parse_node(node.nodelist)
                    tokens.append(f"\\end{{{node.environmentname}}}")
                elif node.isNodeType(LatexCharsNode):
                    tokens += list(node.chars)
                elif node.isNodeType(LatexGroupNode):
                    tokens.append(node.delimiters[0])
                    tokens += parse_node(node.nodelist)
                    tokens.append(node.delimiters[1])
            return tokens
        except Exception as e:
            return []

    nodelist, _, _ = walker.get_latex_nodes()
    return parse_node(nodelist)


# Wrap the tokenizer for use in TextVectorization
def tokenize_fn(latex_tensor):
    tokens = []
    for latex_string in latex_tensor:
        tokenized_string = latex_tokenizer(latex_string.numpy().decode("utf-8"))
        tokenized_string.insert(0, START_TOKEN)
        tokenized_string.append(END_TOKEN)
        tokens.append(tokenized_string)
    return tf.ragged.constant(tokens, dtype=tf.string)


# Create a TensorFlow-compatible wrapper
@tf.function
def tf_tokenizer(latex_string):
    return tf.py_function(
        func=tokenize_fn,
        inp=[latex_string],
        Tout=tf.RaggedTensorSpec([None, None], dtype=tf.string),
    )

Create the vectorizer and use a vocabulary file to adapt it

In [80]:
# Create the TextVectorization layer
max_tokens = 10000  # Adjust depending on your vocabulary size

vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    standardize=None,  # Custom tokenizer, so no built-in preprocessing
    split=tf_tokenizer,
    ragged=True,
)
dataset = tf.data.TextLineDataset("vocabulary.txt")
dataset = dataset.map(lambda line: [line])
vectorizer.adapt(dataset)

2024-12-15 17:48:18.367433: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Test it out to make sure it works properly

In [83]:
def latex_to_token(string):
    return vectorizer(string)


id_to_token = {i: token for i, token in enumerate(vectorizer.get_vocabulary())}


def token_to_latex(tokens):
    return "".join([id_to_token[id] for id in tokens.numpy()])


latex_array = [
    r"E = mc^2",
    r"\frac{a}{b} + \sqrt{c}",
    r"\sum_{i=1}^n i^2 = \frac{n(n+1)(2n+1)}{6}",
    r"A = \pi r^2",
    r"G=\begin{bmatrix}1&\dots&1&0&\dots&0\\ \ast&\ast&\ast&&G^{\prime}&\\ \end{bmatrix}",
]
latex_data = tf.constant(latex_array)

# Tokenize and vectorize
tokenized_output = latex_to_token(latex_data)
print(tokenized_output)

E_mc2 = token_to_latex(tokenized_output[0])
print(E_mc2)

<tf.RaggedTensor [[4, 60, 45, 11, 45, 30, 39, 9, 12, 5],
 [4, 20, 3, 21, 2, 3, 38, 2, 45, 18, 45, 79, 3, 39, 2, 5],
 [4, 73, 6, 3, 19, 11, 10, 2, 9, 16, 45, 19, 9, 12, 45, 11, 45, 20, 3, 16,
  7, 16, 18, 10, 8, 7, 12, 16, 18, 10, 8, 2, 3, 92, 2, 5]                 ,
 [4, 34, 45, 11, 45, 78, 27, 9, 12, 5],
 [4, 76, 11, 165, 10, 154, 10, 17, 154, 17, 75, 45, 245, 245, 245, 76, 9, 3,
  69, 2, 75, 45, 164, 5]                                                    ]>
<START>E = mc^2<END>


Save the config as a pickle just in case

In [85]:
# Save the configuration
vectorizer_config = vectorizer.get_config()

# Save the vocabulary as a list
vectorizer_vocab = vectorizer.get_vocabulary()

# Bundle the config and vocab into a dictionary
vectorizer_data = {
    "config": vectorizer_config,
    "vocab": vectorizer_vocab,
}

# Write to a pickle file
with open("notebook_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

### Preprocessing Strokes

Instead of images, this model takes in a stream of strokes, such as writing with a stylus on a tablet. Our dataset gives us a list of strokes, and each stroke is itself a list of coordinates [x, y] of the position of the stylus. Both the number of strokes and the length of each strokes changes for every value in our dataset, so we are going to pre-process the stroke data so it will be normalized (scaled to be between 0 and 1), and always fit in a tensor with shape `(64, 64, 2,)`. FOr this, I am using the [Ramer-Douglas-Peucker Algorithm](https://en.wikipedia.org/wiki/Ramer%E2%80%93Douglas%E2%80%93Peucker_algorithm) for polyline decimation.

In [70]:
def normalize_strokes(strokes: tf.RaggedTensor):
    # First, scale values to between 0.0 and 1.0
    min_vals = tf.reduce_min(strokes, axis=(0, 1))
    max_vals = tf.reduce_max(strokes, axis=(0, 1))
    normalized_strokes = tf.map_fn(
        elems=strokes,
        fn=lambda stroke: (stroke - min_vals) / (max_vals - min_vals + 1e-6),
    )

    def point_line_distance(point, start, end):
        """
        Calculate the perpendicular distance from `point` to the line segment
        defined by `start` and `end`.
        """
         # Convert to 3D by adding a zero z-component
        point_3d = tf.concat([point, tf.zeros([1], dtype=tf.float32)], axis=0)
        start_3d = tf.concat([start, tf.zeros([1], dtype=tf.float32)], axis=0)
        end_3d = tf.concat([end, tf.zeros([1], dtype=tf.float32)], axis=0)
        
        # Compute the cross product between the vectors
        cross_prod = tf.linalg.cross(end_3d - start_3d, point_3d - start_3d)
        
        # Return the perpendicular distance (norm of the cross product / norm of the line segment)
        return tf.norm(cross_prod) / tf.norm(end_3d - start_3d)

    def douglas_peucker(stroke, epsilon=0.01):
        """
        Non-recursive Douglas-Peucker algorithm implementation.
        """
        stroke_len = tf.shape(stroke)[0]
        if stroke_len < 3:
            return stroke

        # Initialize the list of points to keep
        simplified_stroke = [stroke[0]]

        # Stack for processing: Each entry contains a tuple (start_index, end_index)
        stack = [(0, stroke_len - 1)]

        while stack:
            start_idx, end_idx = stack.pop()

            # Get the relevant slice of the stroke
            sub_stroke = stroke[start_idx : end_idx + 1]

            # Calculate the perpendicular distances of all intermediate points
            start, end = sub_stroke[0], sub_stroke[-1]
            distances = tf.vectorized_map(
                lambda p: point_line_distance(p, start, end), sub_stroke[1:-1]
            )

            # Find the point with the maximum distance
            if tf.size(distances) > 0:
                max_distance = tf.reduce_max(distances)
                max_idx = tf.argmax(distances) + 1  # +1 because we skip the start point

                # If the max distance is greater than epsilon, continue splitting
                if max_distance > epsilon:
                    stack.append((start_idx, start_idx + max_idx))
                    stack.append((start_idx + max_idx, end_idx))
                else:
                    # Otherwise, keep the start and end points
                    simplified_stroke.append(end)
            else:
                # If no intermediate points exist, just keep the start and end points
                simplified_stroke.append(end)

        # Return the simplified stroke
        return tf.stack(simplified_stroke)

    downsampled_strokes = tf.map_fn(elems=normalized_strokes, fn=douglas_peucker)

    # Pad to fixed number of strokes and points per stroke
    return downsampled_strokes.to_tensor(shape=(64, 64, 2))

Let's try it out first!

In [71]:
t = next(iter(validation.take(1)))
print(normalize_strokes(t["strokes"]))

/home/jeshwinprince/Programming/crohme/datasets/crohme_dataset/data/INKML/val/CROHME2023_val/form_5_f_205_E1022.inkml
Displaying app now...
tf.Tensor(
[[[0.         0.42909095]
  [0.09121891 0.46363685]
  [0.06434508 0.4181823 ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.21158199 0.80727303]
  [0.29598787 0.9999998 ]
  [0.2853899  0.03090875]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.36866003 0.8636362 ]
  [0.36222556 0.92363656]
  [0.35011354 0.94363666]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 ...

 [[0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.         0.       

### Preprocessing datasets

Now, we can go through our datasets a preprocess all the data. We will need both our vectorizer and our stroke preprocessor together. Since we are going with an encoder-decoder model, we need input data for the decoder as well, which should be the desired output, but just missing the last token, and with the start token added in front.

In [172]:
def preprocess_data(data):
    ground_truth = vectorizer(data["ground_truth"])
    decoder_input = tf.concat([[vectorizer("[START]")[0]], ground_truth[:-1]], axis=0)
    return (normalize_strokes(data["strokes"]), decoder_input), ground_truth

In [173]:
for i, data in enumerate(test.shuffle(200_000).take(5)):
    pp_data = preprocess_data(data)
    # print(pp_data)
    print("Decoder input:", token_to_latex(pp_data[0][1]))
    print("True value:", token_to_latex(pp_data[1]))

Decoder input: [(+[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2},[UNK]+[UNK][UNK]frac{1}{2},-[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2}
True value: (+[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2},[UNK]+[UNK][UNK]frac{1}{2},-[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2})
Decoder input: [[UNK][UNK]lim_{t[UNK]rightarrow[UNK]infty}[UNK]|[UNK]gamma(t)|=[UNK]infty
True value: [UNK][UNK]lim_{t[UNK]rightarrow[UNK]infty}[UNK]|[UNK]gamma(t)|=[UNK]infty[UNK]
Decoder input: [[UNK][UNK]int[UNK]sqrt{g^{(2)}}
True value: [UNK][UNK]int[UNK]sqrt{g^{(2)}}[UNK]
Decoder input: [xdx=q^2dx
True value: xdx=q^2dxx
Decoder input: [x[UNK]y[UNK]=[UNK][UNK]sum_{j=1}^n[UNK]x_j[UNK]y_
True value: x[UNK]y[UNK]=[UNK][UNK]sum_{j=1}^n[UNK]x_j[UNK]y_j


In [174]:
test = test.map(preprocess_data).shuffle(200_000).batch(32).prefetch(tf.data.AUTOTUNE)
train = train.map(preprocess_data).shuffle(200_000).batch(32).prefetch(tf.data.AUTOTUNE)
validation = (
    validation.map(preprocess_data)
    .shuffle(200_000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

Let's test this out to make sure it worked!

In [175]:
print(next(iter(validation.take(1))))

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_3_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cannot batch tensors with different shapes in component 1. First element had shape [35] and element 1 had shape [11]. [Op:IteratorGetNext] name: 

## Model Architecture

My model is an encoder-decoder architecture, with a CNN for the encoder, a feedforward network to get to the latent space, and a LSTM RNN for the decoder.

### Encoder

In [None]:
input_strokes = layers.Input(shape=(64, 64, 2))
x = layers.Conv2D(64, kernel_size=(3, 3), padding="same")(input_strokes)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(128, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(256, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(512, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)

latent_space = layers.Dense(1024, activation="relu")(x)
latent_space = layers.Dense(512, activation="relu")(latent_space)
latent_space = layers.Dense(512, activation="relu")(latent_space)
latent_space = layers.Dense(512, activation="relu")(latent_space)
latent_space = layers.Dense(512, activation="relu")(latent_space)

### Decoder

In [None]:
latent_space_h = layers.Dense(256, activation="relu")(latent_space)
latent_space_c = layers.Dense(256, activation="relu")(latent_space)

decoder_input = layers.Input(shape=(None,))
decoder_embedding = layers.Embedding(input_dim=vocab_size, output_dim=128)(
    decoder_input
)
decoder_lstm = layers.LSTM(128, return_sequences=True)
decoder_output = decoder_lstm(
    decoder_embedding, initial_state=[latent_space_h, latent_space_c]
)

### Output

In [None]:
output = layers.Dense(vocab_size, activation="softmax")(decoder_output)
model = keras.Model([input_strokes, decoder_input], output)
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

## Training the Model

Now we can finally train our model! We have a ton of training and validation data to use. We'll also save the history so we can get a graph of the change in loss over each epoch.

In [None]:
history = model.fit(
    train,
    validation_data=validation,
    epochs=20,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint("model_checkpoint.h5", save_best_only=True),
    ],
)