# CROHME 2023

## Imports

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import keras
from keras import layers
import matplotlib.pyplot as plt

print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))


2024-12-16 12:03:54.245214: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-16 12:03:54.353391: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734379434.409697    5443 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734379434.422863    5443 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 12:03:54.541949: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Num GPUs Available:  1


Import `crohme_dataset`

In [2]:
import datasets.crohme_dataset  # Register `crohme_dataset`

ds = tfds.load("crohme_dataset")  # `crohme_dataset` registered
test: tf.data.Dataset = ds["test"]
train: tf.data.Dataset = ds["train"]
validation: tf.data.Dataset = ds["validation"]

I0000 00:00:1734379437.562450    5443 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5485 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6


### Extra: Previewing InkML Files

I also created a little utility in C++ and GTK to render out an inkml file from the dataset. It reads the InkML file, and renders out the strokes as well as the LaTeX of what it's supposed to be. It was a fun project!

In [3]:
import os

random_data_point = next(iter(validation.shuffle(200_000).take(1)))
filepath = random_data_point["filepath"].numpy().decode("ascii")
os.system(f"inkmlviewer {filepath}")

2024-12-16 12:04:01.589981: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:376] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608


/home/jeshwinprince/Programming/crohme/datasets/crohme_dataset/data/INKML/val/CROHME2023_val/form_045_E360.inkml
Displaying app now...


0

## Preprocessing

### Text Vectorization

We will use `pylatexenc` to parse the LaTeX into nodes for custom splitting

In [17]:
from pylatexenc.latexwalker import (
    LatexWalker,
    LatexMacroNode,
    LatexEnvironmentNode,
    LatexCharsNode,
    LatexGroupNode,
)

START_TOKEN, END_TOKEN = "<START>", "<END>"


# Define the tokenization function using pylatexenc
def latex_tokenizer(latex_string):
    """
    Tokenizes a LaTeX string into tokens using pylatexenc.
    """
    if not latex_string:
        return []
    walker = LatexWalker(latex_string)

    def parse_node(nodelist):
        if len(nodelist) == 0:
            return []
        try:
            tokens = []
            for node in nodelist:
                if not node:
                    continue
                elif node.isNodeType(LatexMacroNode):
                    tokens.append(f"\\{node.macroname}")
                    # Parse arguments if they exist
                    tokens += parse_node(node.nodeargd.argnlist)
                elif node.isNodeType(LatexEnvironmentNode):
                    tokens.append(f"\\begin{{{node.environmentname}}}")
                    tokens += parse_node(node.nodeargd.argnlist)
                    tokens += parse_node(node.nodelist)
                    tokens.append(f"\\end{{{node.environmentname}}}")
                elif node.isNodeType(LatexCharsNode):
                    tokens += list(node.chars)
                elif node.isNodeType(LatexGroupNode):
                    tokens.append(node.delimiters[0])
                    tokens += parse_node(node.nodelist)
                    tokens.append(node.delimiters[1])
            return tokens
        except Exception as e:
            return []

    nodelist, _, _ = walker.get_latex_nodes()
    return parse_node(nodelist)


# Wrap the tokenizer for use in TextVectorization
def tokenize_fn(latex_tensor):
    tokens = []
    for latex_string in latex_tensor:
        tokenized_string = latex_tokenizer(latex_string.numpy().decode("utf-8"))
        tokenized_string.insert(0, START_TOKEN)
        tokenized_string.append(END_TOKEN)
        tokens.append(tokenized_string)
    return tf.ragged.constant(tokens, dtype=tf.string)


# Create a TensorFlow-compatible wrapper
def tf_tokenizer(latex_string):
    return tf.py_function(
        func=tokenize_fn,
        inp=[latex_string],
        Tout=tf.RaggedTensorSpec([None, None], dtype=tf.string),
    )

Create the vectorizer and use a vocabulary file to adapt it

In [16]:
# Create the TextVectorization layer
max_tokens = 10_000  # Adjust depending on your vocabulary size

vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    standardize=None,  # Custom tokenizer, so no built-in preprocessing
    split=tf_tokenizer,
    ragged=True,
)
dataset = tf.data.TextLineDataset("vocabulary.txt")
dataset = dataset.map(lambda line: [line])
vectorizer.adapt(dataset)

KeyboardInterrupt: 

Test it out to make sure it works properly

In [6]:
def latex_to_token(string):
    return vectorizer(string)


id_to_token = {i: token for i, token in enumerate(vectorizer.get_vocabulary())}


def token_to_latex(tokens):
    return "".join([id_to_token[id] for id in tokens.numpy()])


latex_array = [
    r"E = mc^2",
    r"\frac{a}{b} + \sqrt{c}",
    r"\sum_{i=1}^n i^2 = \frac{n(n+1)(2n+1)}{6}",
    r"A = \pi r^2",
    r"G=\begin{bmatrix}1&\dots&1&0&\dots&0\\ \ast&\ast&\ast&&G^{\prime}&\\ \end{bmatrix}",
]
latex_data = tf.constant(latex_array)

# Tokenize and vectorize
tokenized_output = latex_to_token(latex_data)
print(tokenized_output)

E_mc2 = token_to_latex(tokenized_output[0])
print(E_mc2)

<tf.RaggedTensor [[4, 60, 45, 11, 45, 30, 39, 9, 12, 5],
 [4, 20, 3, 21, 2, 3, 38, 2, 45, 18, 45, 79, 3, 39, 2, 5],
 [4, 73, 6, 3, 19, 11, 10, 2, 9, 16, 45, 19, 9, 12, 45, 11, 45, 20, 3, 16,
  7, 16, 18, 10, 8, 7, 12, 16, 18, 10, 8, 2, 3, 92, 2, 5]                 ,
 [4, 34, 45, 11, 45, 78, 27, 9, 12, 5],
 [4, 76, 11, 165, 10, 154, 10, 17, 154, 17, 75, 45, 245, 245, 245, 76, 9, 3,
  69, 2, 75, 45, 164, 5]                                                    ]>
<START>E = mc^2<END>


Save the vocabulary just in case. You would still need the custom latex parser function imported to use it in another project

In [12]:
with open("vectorizer_vocabulary.txt", "w") as f:
    for word in vectorizer.get_vocabulary():
        f.write(word)
        f.write("\n")

To re-create the vectorizer using the vocabulary, run the following code

In [18]:
max_tokens = 10_000
vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    standardize=None,  # Custom tokenizer, so no built-in preprocessing
    split=tf_tokenizer,
    ragged=True,
)
with open("vectorizer_vocabulary.txt", "r") as f:
    lines = [line[:-1] for line in f]
    vectorizer.set_vocabulary(lines)

Test it out to make sure it works properly

In [19]:
def latex_to_token(string):
    return vectorizer(string)


id_to_token = {i: token for i, token in enumerate(vectorizer.get_vocabulary())}


def token_to_latex(tokens):
    return "".join([id_to_token[id] for id in tokens.numpy()])


latex_array = [
    r"E = mc^2",
    r"\frac{a}{b} + \sqrt{c}",
    r"\sum_{i=1}^n i^2 = \frac{n(n+1)(2n+1)}{6}",
    r"A = \pi r^2",
    r"G=\begin{bmatrix}1&\dots&1&0&\dots&0\\ \ast&\ast&\ast&&G^{\prime}&\\ \end{bmatrix}",
]
latex_data = tf.constant(latex_array)

# Tokenize and vectorize
tokenized_output = latex_to_token(latex_data)
print(tokenized_output)

E_mc2 = token_to_latex(tokenized_output[0])
print(E_mc2)

<tf.RaggedTensor [[4, 60, 45, 11, 45, 30, 39, 9, 12, 5],
 [4, 20, 3, 21, 2, 3, 38, 2, 45, 18, 45, 79, 3, 39, 2, 5],
 [4, 73, 6, 3, 19, 11, 10, 2, 9, 16, 45, 19, 9, 12, 45, 11, 45, 20, 3, 16,
  7, 16, 18, 10, 8, 7, 12, 16, 18, 10, 8, 2, 3, 92, 2, 5]                 ,
 [4, 34, 45, 11, 45, 78, 27, 9, 12, 5],
 [4, 76, 11, 165, 10, 154, 10, 17, 154, 17, 75, 45, 245, 245, 245, 76, 9, 3,
  69, 2, 75, 45, 164, 5]                                                    ]>
<START>E = mc^2<END>


Try an actual data point

In [20]:
random_data_point = next(iter(validation.take(1)))
tokenized_data_point = latex_to_token([random_data_point['ground_truth'].numpy()])
print(tokenized_data_point)
detokenized_data_point = token_to_latex(tokenized_data_point[0])
print(detokenized_data_point)

<tf.RaggedTensor [[4, 15, 10, 29, 127, 77, 77, 5]]>
<START>-1.955<END>


### Preprocessing Strokes

Instead of images, this model takes in a stream of strokes, such as writing with a stylus on a tablet. Our dataset gives us a list of strokes, and each stroke is itself a list of coordinates [x, y] of the position of the stylus. Both the number of strokes and the length of each strokes changes for every value in our dataset, so we are going to pre-process the stroke data so it will be normalized (scaled to be between 0 and 1), and always fit in a tensor with shape `(64, 64, 2,)`. FOr this, I am using the [Ramer-Douglas-Peucker Algorithm](https://en.wikipedia.org/wiki/Ramer%E2%80%93Douglas%E2%80%93Peucker_algorithm) for polyline decimation.

In [58]:
@tf.function
def preprocess_strokes(strokes: tf.RaggedTensor):
    # First, scale values to between 0.0 and 1.0
    min_vals = tf.reduce_min(strokes, axis=(0, 1))
    max_vals = tf.reduce_max(strokes, axis=(0, 1))
    normalized_strokes = tf.map_fn(
        elems=strokes,
        fn=lambda stroke: (stroke - min_vals) / (max_vals - min_vals + 1e-6),
    )

    @tf.function
    def point_line_distance(point, start, end):
        """
        Calculate the perpendicular distance from `point` to the line segment
        defined by `start` and `end`.
        """
        # Convert to 3D by adding a zero z-component
        point_3d = tf.concat([point, tf.zeros([1], dtype=tf.float32)], axis=0)
        start_3d = tf.concat([start, tf.zeros([1], dtype=tf.float32)], axis=0)
        end_3d = tf.concat([end, tf.zeros([1], dtype=tf.float32)], axis=0)

        # Return the perpendicular distance (norm of the cross product / norm of the line segment)
        return tf.norm(
            tf.linalg.cross(end_3d - start_3d, point_3d - start_3d)
        ) / tf.norm(end_3d - start_3d)

    @tf.function
    def douglas_peucker(stroke, epsilon=0.02):
        """
        Non-recursive Douglas-Peucker algorithm implementation.
        """
        stroke_len = tf.shape(stroke)[0]
        if stroke_len < 3:
            return stroke

        # Initialize the list of points to keep
        simplified_stroke: tf.TensorArray = tf.TensorArray(
            dtype=stroke.dtype, size=0, dynamic_size=True
        )
        simplified_stroke = simplified_stroke.write(0, stroke[0])

        # Stack for processing: Each entry contains a tuple (start_index, end_index)
        stack: tf.TensorArray = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True, clear_after_read=True)
        stack = stack.write(0, [0, stroke_len - 1])

        while stack.size() > 0:
            tf.print(stack)
            tf.print(simplified_stroke)
            # Pop from the stack
            border_idxs = stack.read(stack.size() - 1)
            start_idx, end_idx = border_idxs[0], border_idxs[1]

            # Get the relevant slice of the stroke
            sub_stroke = stroke[start_idx : end_idx + 1]

            # Calculate the perpendicular distances of all intermediate points
            start, end = sub_stroke[0], sub_stroke[-1]
            distances = tf.vectorized_map(
                elems=sub_stroke[1:-1], fn=lambda p: point_line_distance(p, start, end)
            )

            # Find the point with the maximum distance
            if tf.size(distances) > 0:
                max_distance = tf.reduce_max(distances)
                max_idx = tf.argmax(distances, output_type=tf.int32) + 1  # +1 because we skip the start point

                # If the max distance is greater than epsilon, continue splitting
                if max_distance > epsilon:
                    stack = stack.write(stack.size(), [start_idx, start_idx + max_idx])
                    stack = stack.write(stack.size(), [start_idx + max_idx, end_idx])
                else:
                    # Otherwise, keep the start and end points
                    simplified_stroke = simplified_stroke.write(simplified_stroke.size(), end)            
            else:
                # If no intermediate points exist, just keep the start and end points
                simplified_stroke = simplified_stroke.write(simplified_stroke.size(), end)

        # Return the simplified stroke
        return simplified_stroke.stack()

    downsampled_strokes = tf.map_fn(elems=normalized_strokes, fn=douglas_peucker)
    return downsampled_strokes

Let's try it out first!

In [57]:
random_data_point = next(iter(validation.take(1)))
print(preprocess_strokes(random_data_point["strokes"]))
filepath = random_data_point["filepath"].numpy().decode("ascii")
os.system(f"inkmlviewer {filepath}")





ERROR:tensorflow:Got error while pfor was converting op name: "cond/while/cond/cond"
op: "StatelessIf"
input: "cond/while/cond/Greater"
input: "cond/while/cond/cond/cond/while/Placeholder_1"
input: "cond/while/cond/cond/cond/while/strided_slice"
input: "cond/while/cond/add"
input: "cond/while/cond/cond/cond/while/strided_slice_1"
input: "cond/while/cond/cond/cond/while/Placeholder"
input: "cond/while/cond/cond/cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_cond_true_243993"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_cond_false_243994"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT


ERROR:tensorflow:Got error while pfor was converting op name: "cond/while/cond/cond"
op: "StatelessIf"
input: "cond/while/cond/Greater"
input: "cond/while/cond/cond/cond/while/Placeholder_1"
input: "cond/while/cond/cond/cond/while/strided_slice"
input: "cond/while/cond/add"
input: "cond/while/cond/cond/cond/while/strided_slice_1"
input: "cond/while/cond/cond/cond/while/Placeholder"
input: "cond/while/cond/cond/cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_cond_true_243993"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_cond_false_243994"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT


ERROR:tensorflow:name: "cond/while/cond/cond"
op: "StatelessIf"
input: "cond/while/cond/Greater"
input: "cond/while/cond/cond/cond/while/Placeholder_1"
input: "cond/while/cond/cond/cond/while/strided_slice"
input: "cond/while/cond/add"
input: "cond/while/cond/cond/cond/while/strided_slice_1"
input: "cond/while/cond/cond/cond/while/Placeholder"
input: "cond/while/cond/cond/cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_cond_true_243993"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_cond_false_243994"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT
    }
  }
}
attr {
  key: "Tin"
  value

ERROR:tensorflow:name: "cond/while/cond/cond"
op: "StatelessIf"
input: "cond/while/cond/Greater"
input: "cond/while/cond/cond/cond/while/Placeholder_1"
input: "cond/while/cond/cond/cond/while/strided_slice"
input: "cond/while/cond/add"
input: "cond/while/cond/cond/cond/while/strided_slice_1"
input: "cond/while/cond/cond/cond/while/Placeholder"
input: "cond/while/cond/cond/cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_cond_true_243993"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_cond_false_243994"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT
    }
  }
}
attr {
  key: "Tin"
  value

ERROR:tensorflow:Got error while pfor was converting op name: "cond/while/cond"
op: "StatelessIf"
input: "cond/while/Greater"
input: "cond/while/loop_body/PartitionedCall/pfor/PartitionedCall"
input: "cond/while/Placeholder_1"
input: "cond/while/strided_slice"
input: "cond/while/strided_slice_1"
input: "cond/while/Placeholder"
input: "cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_true_243981"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_false_243982"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: D

ERROR:tensorflow:Got error while pfor was converting op name: "cond/while/cond"
op: "StatelessIf"
input: "cond/while/Greater"
input: "cond/while/loop_body/PartitionedCall/pfor/PartitionedCall"
input: "cond/while/Placeholder_1"
input: "cond/while/strided_slice"
input: "cond/while/strided_slice_1"
input: "cond/while/Placeholder"
input: "cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_true_243981"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_false_243982"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: D

ERROR:tensorflow:name: "cond/while/cond"
op: "StatelessIf"
input: "cond/while/Greater"
input: "cond/while/loop_body/PartitionedCall/pfor/PartitionedCall"
input: "cond/while/Placeholder_1"
input: "cond/while/strided_slice"
input: "cond/while/strided_slice_1"
input: "cond/while/Placeholder"
input: "cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_true_243981"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_false_243982"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: DT_VARIANT
      type: DT_INT32
      ty

ERROR:tensorflow:name: "cond/while/cond"
op: "StatelessIf"
input: "cond/while/Greater"
input: "cond/while/loop_body/PartitionedCall/pfor/PartitionedCall"
input: "cond/while/Placeholder_1"
input: "cond/while/strided_slice"
input: "cond/while/strided_slice_1"
input: "cond/while/Placeholder"
input: "cond/while/strided_slice_4"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_while_cond_true_243981"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_while_cond_false_243982"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_VARIANT
      type: DT_VARIANT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: DT_VARIANT
      type: DT_INT32
      ty

ERROR:tensorflow:Got error while pfor was converting op name: "cond/while"
op: "While"
input: "cond/while/loop_counter"
input: "cond/while/maximum_iterations"
input: "cond/TensorArrayV2Write/TensorListSetItem"
input: "cond/TensorArrayV2Write_1/TensorListSetItem"
input: "cond/strided_slice/stroke"
attr {
  key: "parallel_iterations"
  value {
    i: 10
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "cond"
  value {
    func {
      name: "cond_while_cond_243788"
    }
  }
}
attr {
  key: "body"
  value {
    func {
      name: "cond_while_body_243789"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_num_original_outputs"
  value {
    i: 5
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: tr

ERROR:tensorflow:Got error while pfor was converting op name: "cond/while"
op: "While"
input: "cond/while/loop_counter"
input: "cond/while/maximum_iterations"
input: "cond/TensorArrayV2Write/TensorListSetItem"
input: "cond/TensorArrayV2Write_1/TensorListSetItem"
input: "cond/strided_slice/stroke"
attr {
  key: "parallel_iterations"
  value {
    i: 10
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "cond"
  value {
    func {
      name: "cond_while_cond_243788"
    }
  }
}
attr {
  key: "body"
  value {
    func {
      name: "cond_while_body_243789"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_num_original_outputs"
  value {
    i: 5
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: tr

ERROR:tensorflow:name: "cond/while"
op: "While"
input: "cond/while/loop_counter"
input: "cond/while/maximum_iterations"
input: "cond/TensorArrayV2Write/TensorListSetItem"
input: "cond/TensorArrayV2Write_1/TensorListSetItem"
input: "cond/strided_slice/stroke"
attr {
  key: "parallel_iterations"
  value {
    i: 10
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "cond"
  value {
    func {
      name: "cond_while_cond_243788"
    }
  }
}
attr {
  key: "body"
  value {
    func {
      name: "cond_while_body_243789"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_num_original_outputs"
  value {
    i: 5
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "T"
  value {
  

ERROR:tensorflow:name: "cond/while"
op: "While"
input: "cond/while/loop_counter"
input: "cond/while/maximum_iterations"
input: "cond/TensorArrayV2Write/TensorListSetItem"
input: "cond/TensorArrayV2Write_1/TensorListSetItem"
input: "cond/strided_slice/stroke"
attr {
  key: "parallel_iterations"
  value {
    i: 10
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "cond"
  value {
    func {
      name: "cond_while_cond_243788"
    }
  }
}
attr {
  key: "body"
  value {
    func {
      name: "cond_while_body_243789"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_num_original_outputs"
  value {
    i: 5
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "T"
  value {
  

ERROR:tensorflow:Got error while pfor was converting op name: "cond"
op: "If"
input: "Less"
input: "stroke"
input: "strided_slice"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_true_243755"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_false_243756"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_BOOL
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: DT_INT32
    }
  }
}
attr {
  key: "Tcond"
  value {
    type: DT_BOOL
  }
}
 with inputs (<tf.Tensor 'Less:0' shape=() dtype=bool>, <tf.Tensor 'stroke:0' shape=(None, 2) dtype=

ERROR:tensorflow:Got error while pfor was converting op name: "cond"
op: "If"
input: "Less"
input: "stroke"
input: "strided_slice"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_true_243755"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_false_243756"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_BOOL
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: DT_INT32
    }
  }
}
attr {
  key: "Tcond"
  value {
    type: DT_BOOL
  }
}
 with inputs (<tf.Tensor 'Less:0' shape=() dtype=bool>, <tf.Tensor 'stroke:0' shape=(None, 2) dtype=

ERROR:tensorflow:name: "cond"
op: "If"
input: "Less"
input: "stroke"
input: "strided_slice"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_true_243755"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_false_243756"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_BOOL
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: DT_INT32
    }
  }
}
attr {
  key: "Tcond"
  value {
    type: DT_BOOL
  }
}

created at:
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/home/j

ERROR:tensorflow:name: "cond"
op: "If"
input: "Less"
input: "stroke"
input: "strided_slice"
attr {
  key: "then_branch"
  value {
    func {
      name: "cond_true_243755"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
        dim {
          size: -1
        }
        dim {
          size: 2
        }
      }
    }
  }
}
attr {
  key: "else_branch"
  value {
    func {
      name: "cond_false_243756"
    }
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_lower_using_switch_merge"
  value {
    b: true
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_BOOL
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
      type: DT_INT32
    }
  }
}
attr {
  key: "Tcond"
  value {
    type: DT_BOOL
  }
}

created at:
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/home/j

ERROR:tensorflow:Got error while pfor was converting op name: "loop_body_1/StatefulPartitionedCall"
op: "StatefulPartitionedCall"
input: "loop_body_1/GatherV2"
attr {
  key: "f"
  value {
    func {
      name: "__inference_douglas_peucker_244074"
    }
  }
}
attr {
  key: "executor_type"
  value {
    s: ""
  }
}
attr {
  key: "config"
  value {
    s: ""
  }
}
attr {
  key: "config_proto"
  value {
    s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001\202\001\000\222\001\002J\000"
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_collective_manager_ids"
  value {
    list {
    }
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
    }
  }
}
 with inputs (<tf.Tensor 'loop_body_1/GatherV2:0' shape=(None, 2) dtype=float32>,)
, converted inputs [WrappedTensor(t=<tf.Tensor 'loop_body/truediv/pfor/RealDiv:0' shape=(7, None, 2) dt

ERROR:tensorflow:Got error while pfor was converting op name: "loop_body_1/StatefulPartitionedCall"
op: "StatefulPartitionedCall"
input: "loop_body_1/GatherV2"
attr {
  key: "f"
  value {
    func {
      name: "__inference_douglas_peucker_244074"
    }
  }
}
attr {
  key: "executor_type"
  value {
    s: ""
  }
}
attr {
  key: "config"
  value {
    s: ""
  }
}
attr {
  key: "config_proto"
  value {
    s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001\202\001\000\222\001\002J\000"
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_collective_manager_ids"
  value {
    list {
    }
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
    }
  }
}
 with inputs (<tf.Tensor 'loop_body_1/GatherV2:0' shape=(None, 2) dtype=float32>,)
, converted inputs [WrappedTensor(t=<tf.Tensor 'loop_body/truediv/pfor/RealDiv:0' shape=(7, None, 2) dt

ERROR:tensorflow:name: "loop_body_1/StatefulPartitionedCall"
op: "StatefulPartitionedCall"
input: "loop_body_1/GatherV2"
attr {
  key: "f"
  value {
    func {
      name: "__inference_douglas_peucker_244074"
    }
  }
}
attr {
  key: "executor_type"
  value {
    s: ""
  }
}
attr {
  key: "config"
  value {
    s: ""
  }
}
attr {
  key: "config_proto"
  value {
    s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001\202\001\000\222\001\002J\000"
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_collective_manager_ids"
  value {
    list {
    }
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
    }
  }
}

created at:
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel_launcher.

ERROR:tensorflow:name: "loop_body_1/StatefulPartitionedCall"
op: "StatefulPartitionedCall"
input: "loop_body_1/GatherV2"
attr {
  key: "f"
  value {
    func {
      name: "__inference_douglas_peucker_244074"
    }
  }
}
attr {
  key: "executor_type"
  value {
    s: ""
  }
}
attr {
  key: "config"
  value {
    s: ""
  }
}
attr {
  key: "config_proto"
  value {
    s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001\202\001\000\222\001\002J\000"
  }
}
attr {
  key: "_read_only_resource_inputs"
  value {
    list {
    }
  }
}
attr {
  key: "_collective_manager_ids"
  value {
    list {
    }
  }
}
attr {
  key: "Tout"
  value {
    list {
      type: DT_FLOAT
    }
  }
}
attr {
  key: "Tin"
  value {
    list {
      type: DT_FLOAT
    }
  }
}

created at:
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel_launcher.

ValueError: in user code:

    File "/tmp/ipykernel_5443/798484747.py", line 81, in preprocess_strokes  *
        downsampled_strokes = tf.vectorized_map(elems=normalized_strokes, fn=douglas_peucker)

    ValueError: Shape must be at least rank 1 but is rank 0 for '{{node cond/while/cond/cond/pfor/GatherV2}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_VARIANT, batch_dims=0](args_1, cond/while/cond/cond/pfor/DynamicPartition:1, cond/while/cond/cond/pfor/GatherV2/axis)' with input shapes: [], [?], [].


### Preprocessing datasets

Now, we can go through our datasets a preprocess all the data. We will need both our vectorizer and our stroke preprocessor together. Since we are going with an encoder-decoder model, we need input data for the decoder as well, which should be the desired output, but just missing the last token, and with the start token added in front.

In [34]:
def preprocess_data(data):
    input_strokes = preprocess_strokes(data["strokes"])
    ground_truth = vectorizer([data["ground_truth"].numpy()])
    decoder_input = ground_truth[0][:-1]
    decoder_output = ground_truth[0][1:]
    return (input_strokes, decoder_input), decoder_output

In [35]:
for i, data in enumerate(test.shuffle(200_000).take(5).map(preprocess_data)):
    # pp_data = preprocess_data(data)
    pp_data = data
    print("Decoder input:", token_to_latex(pp_data[0][1]))
    print("True value:", token_to_latex(pp_data[1]))

InaccessibleTensorError: in user code:

    File "/tmp/ipykernel_5443/2758872598.py", line 2, in preprocess_data  *
        input_strokes = preprocess_strokes(data["strokes"])
    File "/tmp/ipykernel_5443/1607123673.py", line 42, in douglas_peucker  *
        sub_stroke = tf.Variable(stroke[start_idx : sub_stroke_end_idx])
    File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/tensorflow/core/function/capture/capture_container.py", line 144, in capture_by_value
        graph._validate_in_scope(tensor)  # pylint: disable=protected-access

    InaccessibleTensorError: <tf.Tensor 'map_1/while/cond/cond/cond/add_1:0' shape=() dtype=int64> is out of scope and cannot be used here. Use return values, explicit Python locals or TensorFlow collections to access it.
    Please see https://www.tensorflow.org/guide/function#all_outputs_of_a_tffunction_must_be_return_values for more information.
    
    <tf.Tensor 'map_1/while/cond/cond/cond/add_1:0' shape=() dtype=int64> was defined here:
        File "<frozen runpy>", line 198, in _run_module_as_main
        File "<frozen runpy>", line 88, in _run_code
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start
        File "/usr/lib/python3.12/asyncio/base_events.py", line 641, in run_forever
        File "/usr/lib/python3.12/asyncio/base_events.py", line 1987, in _run_once
        File "/usr/lib/python3.12/asyncio/events.py", line 88, in _run
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 362, in execute_request
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 449, in do_execute
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
        File "/tmp/ipykernel_5443/554503989.py", line 1, in <module>
        File "/tmp/ipykernel_5443/2758872598.py", line 2, in preprocess_data
        File "/tmp/ipykernel_5443/1607123673.py", line 69, in preprocess_strokes
        File "/tmp/ipykernel_5443/1607123673.py", line 28, in douglas_peucker
        File "/tmp/ipykernel_5443/1607123673.py", line 37, in douglas_peucker
        File "/tmp/ipykernel_5443/1607123673.py", line 51, in douglas_peucker
        File "/tmp/ipykernel_5443/1607123673.py", line 56, in douglas_peucker
        File "/tmp/ipykernel_5443/1607123673.py", line 58, in douglas_peucker
    
    The tensor <tf.Tensor 'map_1/while/cond/cond/cond/add_1:0' shape=() dtype=int64> cannot be accessed from FuncGraph(name=map_1_while_cond_false_175582, id=139776695820736), because it was defined in FuncGraph(name=map_1_while_cond_cond_cond_true_175753, id=139776707101248), which is out of scope.


In [26]:
test = test.map(preprocess_data).shuffle(200_000).batch(32).prefetch(tf.data.AUTOTUNE)
train = train.map(preprocess_data).shuffle(200_000).batch(32).prefetch(tf.data.AUTOTUNE)
validation = (
    validation.map(preprocess_data)
    .shuffle(200_000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

InaccessibleTensorError: in user code:

    File "/tmp/ipykernel_5443/2758872598.py", line 2, in preprocess_data  *
        input_strokes = preprocess_strokes(data["strokes"])
    File "/tmp/ipykernel_5443/1477763246.py", line 41, in douglas_peucker  *
        sub_stroke = stroke[start_idx : end_idx + 1]
    File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/tensorflow/core/function/capture/capture_container.py", line 144, in capture_by_value
        graph._validate_in_scope(tensor)  # pylint: disable=protected-access

    InaccessibleTensorError: <tf.Tensor 'map_1/while/cond/cond/cond/add_1:0' shape=() dtype=int64> is out of scope and cannot be used here. Use return values, explicit Python locals or TensorFlow collections to access it.
    Please see https://www.tensorflow.org/guide/function#all_outputs_of_a_tffunction_must_be_return_values for more information.
    
    <tf.Tensor 'map_1/while/cond/cond/cond/add_1:0' shape=() dtype=int64> was defined here:
        File "<frozen runpy>", line 198, in _run_module_as_main
        File "<frozen runpy>", line 88, in _run_code
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start
        File "/usr/lib/python3.12/asyncio/base_events.py", line 641, in run_forever
        File "/usr/lib/python3.12/asyncio/base_events.py", line 1987, in _run_once
        File "/usr/lib/python3.12/asyncio/events.py", line 88, in _run
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 362, in execute_request
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 449, in do_execute
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
        File "/home/jeshwinprince/Programming/crohme/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
        File "/tmp/ipykernel_5443/1745723387.py", line 1, in <module>
        File "/tmp/ipykernel_5443/2758872598.py", line 2, in preprocess_data
        File "/tmp/ipykernel_5443/1477763246.py", line 68, in preprocess_strokes
        File "/tmp/ipykernel_5443/1477763246.py", line 28, in douglas_peucker
        File "/tmp/ipykernel_5443/1477763246.py", line 37, in douglas_peucker
        File "/tmp/ipykernel_5443/1477763246.py", line 50, in douglas_peucker
        File "/tmp/ipykernel_5443/1477763246.py", line 55, in douglas_peucker
        File "/tmp/ipykernel_5443/1477763246.py", line 57, in douglas_peucker
    
    The tensor <tf.Tensor 'map_1/while/cond/cond/cond/add_1:0' shape=() dtype=int64> cannot be accessed from FuncGraph(name=map_1_while_cond_false_163166, id=139776560381632), because it was defined in FuncGraph(name=map_1_while_cond_cond_cond_true_163315, id=139776560336064), which is out of scope.


Let's test this out to make sure it worked!

In [28]:
print(next(iter(validation.take(1))))

{'filepath': <tf.Tensor: shape=(), dtype=string, numpy=b'/home/jeshwinprince/Programming/crohme/datasets/crohme_dataset/data/INKML/val/CROHME2023_val/form_5_f_205_E1022.inkml'>, 'ground_truth': <tf.Tensor: shape=(), dtype=string, numpy=b'-1.955'>, 'strokes': <tf.RaggedTensor [[[65.75, 78.25],
  [65.75, 78.25],
  [65.75, 78.26],
  [65.76, 78.27],
  [65.78, 78.27],
  [65.81, 78.28],
  [65.83, 78.27],
  [65.85, 78.25],
  [65.88, 78.25],
  [65.91, 78.24],
  [65.96, 78.23],
  [66.03, 78.21],
  [66.16, 78.19],
  [66.36, 78.17],
  [66.61, 78.16],
  [66.88, 78.16],
  [67.18, 78.17],
  [67.45, 78.19],
  [67.72, 78.25],
  [67.96, 78.33],
  [68.16, 78.44]], [[71.34, 80.33],
                    [71.34, 80.33],
                    [71.34, 80.33],
                    [71.35, 80.34],
                    [71.36, 80.35],
                    [71.37, 80.34],
                    [71.38, 80.33],
                    [71.4, 80.31],
                    [71.41, 80.28],
                    [71.44, 80.26],
     

## Model Architecture

My model is an encoder-decoder architecture, with a CNN for the encoder, a feedforward network to get to the latent space, and a LSTM RNN for the decoder.

### Encoder

In [None]:
input_strokes = layers.Input(shape=(64, 64, 2))
x = layers.Conv2D(64, kernel_size=(3, 3), padding="same")(input_strokes)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(128, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(256, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(512, kernel_size=(3, 3), padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)

latent_space = layers.Dense(1024, activation="relu")(x)
latent_space = layers.Dense(512, activation="relu")(latent_space)
latent_space = layers.Dense(512, activation="relu")(latent_space)
latent_space = layers.Dense(512, activation="relu")(latent_space)
latent_space = layers.Dense(512, activation="relu")(latent_space)

### Decoder

In [None]:
latent_space_h = layers.Dense(256, activation="relu")(latent_space)
latent_space_c = layers.Dense(256, activation="relu")(latent_space)

decoder_input = layers.Input(shape=(None,))
decoder_embedding = layers.Embedding(input_dim=vectorizer.vocabulary_size(), output_dim=128)(
    decoder_input
)
decoder_lstm = layers.LSTM(128, return_sequences=True)
decoder_output = decoder_lstm(
    decoder_embedding, initial_state=[latent_space_h, latent_space_c]
)

### Output

In [None]:
output = layers.Dense(vectorizer.vocabulary_size(), activation="softmax")(decoder_output)
model = keras.Model([input_strokes, decoder_input], output)
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

## Training the Model

Now we can finally train our model! We have a ton of training and validation data to use. We'll also save the history so we can get a graph of the change in loss over each epoch.

In [None]:
history = model.fit(
    train,
    validation_data=validation,
    epochs=20,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint("model_checkpoint.h5", save_best_only=True),
    ],
)