# CROHME 2023

## Imports

In [163]:
import tensorflow as tf
import tensorflow_datasets as tfds
import keras
from keras import layers
import matplotlib.pyplot as plt
import pickle
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


Let's also import the dataset I created, called `crohme_dataset`.

In [164]:
import datasets.crohme_dataset  # Register `crohme_dataset`

ds = tfds.load("crohme_dataset")  # `crohme_dataset` registered
test: tf.data.Dataset = ds["test"]
train: tf.data.Dataset = ds["train"]
validation: tf.data.Dataset = ds["validation"]

### Extra: Previewing InkML Files

Alongside this project, I also created a little utility in C++ and GTK to render out an inkml file from the dataset. It reads the InkML file, and renders out the strokes as well as the LaTeX of what it's supposed to be. It was a fun project! Definitely useful for debugging and toy deployments!

In [165]:
import os

random_data_point = next(iter(validation.shuffle(200_000).take(1)))
filepath = random_data_point['filepath'].numpy().decode('ascii')
os.system(f"inkmlviewer {filepath}")

/home/jeshwinprince/Programming/crohme/datasets/crohme_dataset/data/INKML/val/CROHME2023_val/form_5_657_E3283.inkml
Displaying app now...


0

## Preprocessing

### Text Vectorization

In order for our model to use strings as inputs and outputs, we need to convert them into a format that they understand, which is tensors. To do this, we use TextVectorization, which takes a vocabulary of tokens and creates a function that can convert strings into a usable tensor.

I have a file called `vocabulary.txt` which contains all the LaTeX tokens I want to look for. 

In [166]:
VOCABULARY_FILE = "vocabulary.txt"
with open(VOCABULARY_FILE, "r") as f:
    vocab = [lines.strip() for lines in f if lines[0] != '%']
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

Vocabulary size: 152


I can then use this list of tokens as the vocabulary for our text vectorizer.

In [167]:
vectorizer = layers.TextVectorization(
    vocabulary=vocab,
    output_mode='int',
    standardize=None,
    split='character'
)

I'm going to save my vectorizer as a Pickle, a `.pkl` file, so that I can use it for a deployed model. 

In [168]:
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

Let's make some functions that can convert LaTex strings to tokens, and back!

In [169]:
def latex_to_token(string):
    return vectorizer(string)

id_to_token = {i: token for i, token in enumerate(vectorizer.get_vocabulary())}

def token_to_latex(tokens):
    return "".join([id_to_token[id] for id in tokens.numpy()])

### Preprocessing Strokes

This will get more complicated. Instead of images, this model takes in a stream of strokes, such as writing with a stylus on a phone or tablet. Our dataset gives us a list of strokes, and each stroke is itself a list of coordinates [x, y] of the position of the stylus. Both the number of strokes and the length of each strokes changes for every value in our dataset, so we are going to pre-process the stroke data so it will always fit in a tensor with shape `(64, 64, 2,)`. Based on observations of the dataset, there are no more than 45 strokes per input, and no more than 255 points per stroke. So we could simply pad out the variable length to `(64, 256, 2,)`. Then, we can use a sliding window of length 4 to go through all 256 points and basically smooth them out to get to our desired dimension.

 We also need to normalize our inputs, so that the inputs to our model are always between 0 and 1. This makes the computation much easier. In order to do this, we need to know the minimum and maximum values of the x and y coordinates in each input, and scale each stroke by that amount, since we still need to preserve the relative scale and position of the strokes. So we can just get the min and max and use them to normalize each point.

In [170]:
def normalize_strokes(strokes: tf.RaggedTensor, max_strokes=64, max_points_per_stroke=64, window_size=4):
    strokes = strokes.to_tensor(default_value=0.0, shape=(max_strokes, max_points_per_stroke * window_size, 2))
    min_vals = tf.reduce_min(strokes, axis=(0, 1))
    max_vals = tf.reduce_max(strokes, axis=(0, 1))
    normalized_strokes = tf.map_fn(
        elems=strokes,
        fn=lambda stroke: (stroke - min_vals) / (max_vals - min_vals + 1e-6),
    )

    # Downsample each stroke using a sliding window
    def downsample_with_sliding_window(stroke):
        # Pad stroke to ensure divisibility by window_size
        padding_needed = window_size - (tf.shape(stroke)[0] % window_size)
        padding_needed = tf.cond(
            padding_needed == window_size, lambda: 0, lambda: padding_needed
        )
        padded_stroke = tf.pad(
            stroke, [[0, padding_needed], [0, 0]], constant_values=0.0
        )

        # Reshape into (num_windows, window_size, 2) and compute mean over window_size
        reshaped = tf.reshape(padded_stroke, [-1, window_size, 2])
        downsampled = tf.reduce_mean(reshaped, axis=1)

        # If the resulting downsampled stroke is too long, truncate to max_points_per_stroke
        return downsampled[:max_points_per_stroke]

    downsampled_strokes = tf.map_fn(
        elems=normalized_strokes, fn=downsample_with_sliding_window
    )

    # Pad to fixed number of strokes and points per stroke
    return downsampled_strokes

Let's try it out first!

In [171]:
t = next(iter(validation.shuffle(200_000).take(1)))
print(normalize_strokes(t["strokes"]))

tf.Tensor(
[[[0.59055555 0.63455886]
  [0.5952778  0.70147055]
  [0.6022222  0.7073529 ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.6736111  0.63014704]
  [0.69305557 0.6242647 ]
  [0.71833336 0.625     ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.68416667 0.7264706 ]
  [0.71       0.7220588 ]
  [0.73083335 0.7205882 ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 ...

 [[0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]
  ...
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.        ]]]

### Preprocessing datasets

Now, we can go through our datasets a preprocess all the data. We will need both our vectorizer and our stroke preprocessor together. Since we are going with an encoder-decoder model, we need input data for the decoder as well, which should be the desired output, but just missing the last token, and with the start token added in front.

In [172]:
def preprocess_data(data):
    ground_truth = vectorizer(data['ground_truth'])
    decoder_input = tf.concat([[vectorizer("[START]")[0]], ground_truth[:-1]], axis=0)
    return (normalize_strokes(data['strokes']), decoder_input), ground_truth

In [173]:
for i, data in enumerate(test.shuffle(200_000).take(5)):
    pp_data = preprocess_data(data)
    # print(pp_data)
    print("Decoder input:", token_to_latex(pp_data[0][1]))
    print("True value:", token_to_latex(pp_data[1]))

Decoder input: [(+[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2},[UNK]+[UNK][UNK]frac{1}{2},-[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2}
True value: (+[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2},[UNK]+[UNK][UNK]frac{1}{2},-[UNK][UNK]frac{1}{2},[UNK]-[UNK][UNK]frac{1}{2})
Decoder input: [[UNK][UNK]lim_{t[UNK]rightarrow[UNK]infty}[UNK]|[UNK]gamma(t)|=[UNK]infty
True value: [UNK][UNK]lim_{t[UNK]rightarrow[UNK]infty}[UNK]|[UNK]gamma(t)|=[UNK]infty[UNK]
Decoder input: [[UNK][UNK]int[UNK]sqrt{g^{(2)}}
True value: [UNK][UNK]int[UNK]sqrt{g^{(2)}}[UNK]
Decoder input: [xdx=q^2dx
True value: xdx=q^2dxx
Decoder input: [x[UNK]y[UNK]=[UNK][UNK]sum_{j=1}^n[UNK]x_j[UNK]y_
True value: x[UNK]y[UNK]=[UNK][UNK]sum_{j=1}^n[UNK]x_j[UNK]y_j


In [174]:
test = test.map(preprocess_data).shuffle(200_000).batch(32).prefetch(tf.data.AUTOTUNE)
train = train.map(preprocess_data).shuffle(200_000).batch(32).prefetch(tf.data.AUTOTUNE)
validation = validation.map(preprocess_data).shuffle(200_000).batch(32).prefetch(tf.data.AUTOTUNE)

Let's test this out to make sure it worked!

In [175]:
print(next(iter(validation.take(1))))

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_3_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cannot batch tensors with different shapes in component 1. First element had shape [35] and element 1 had shape [11]. [Op:IteratorGetNext] name: 

## Model Architecture

My model is an encoder-decoder architecture, with a CNN for the encoder, a feedforward network to get to the latent space, and a LSTM RNN for the decoder.

### Encoder

In [None]:
input_strokes = layers.Input(shape=(64, 64, 2))
x = layers.Conv2D(64, kernel_size=(3, 3), padding="same")(input_strokes)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(128, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(256, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(512, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)

latent_space = layers.Dense(1024, activation='relu')(x)
latent_space = layers.Dense(512, activation='relu')(latent_space)
latent_space = layers.Dense(512, activation='relu')(latent_space)
latent_space = layers.Dense(512, activation='relu')(latent_space)
latent_space = layers.Dense(512, activation='relu')(latent_space)

### Decoder

In [None]:
latent_space_h = layers.Dense(256, activation='relu')(latent_space)
latent_space_c = layers.Dense(256, activation='relu')(latent_space)

decoder_input = layers.Input(shape=(None,))
decoder_embedding = layers.Embedding(input_dim=vocab_size, output_dim=128)(decoder_input)
decoder_lstm = layers.LSTM(128, return_sequences=True)
decoder_output = decoder_lstm(decoder_embedding, initial_state=[latent_space_h, latent_space_c])

### Output

In [None]:
output = layers.Dense(vocab_size, activation='softmax')(decoder_output)
model = keras.Model([input_strokes, decoder_input], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## Training the Model

Now we can finally train our model! We have a ton of training and validation data to use. We'll also save the history so we can get a graph of the change in loss over each epoch.

In [None]:
history = model.fit(
    train,
    validation_data=validation,
    epochs=20,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint("model_checkpoint.h5", save_best_only=True),
    ],
)