In [1]:
#download the dataset
import kagglehub
path = kagglehub.dataset_download("ntcuong2103/crohme2019")
path

'/kaggle/input/crohme2019'

In [None]:
# authored by me for parsing and rendering InkML handwriting data

import os
import xml.etree.ElementTree as ET
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image, display
from glob import glob

# parse an InkML file and optionally save the rendered trace image
def parse_and_render_inkml(file_path, save_path=None):
    """
    Parses an InkML (.inkml) file, extracts stroke data (traces),
    renders it as a handwriting image using matplotlib,
    and optionally saves the image to a file.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    traces = []
    # extract all <trace> elements
    for trace in root.findall(".//{http://www.w3.org/2003/InkML}trace"):
        raw_points = trace.text.strip().split(',')
        stroke = []
        for point in raw_points:
            coords = point.strip().split()
            if len(coords) == 2:
                stroke.append([float(coords[0]), float(coords[1])])
        if stroke:
            traces.append(np.array(stroke))

    # create a figure and draw the strokes
    fig, ax = plt.subplots()
    for stroke in traces:
        ax.plot(stroke[:, 0], -stroke[:, 1], linewidth=2)  # flip y-axis

    ax.axis('off')
    ax.set_aspect('equal')

    # save the figure if path provided, otherwise show it
    if save_path:
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
        plt.close(fig)
    else:
        plt.show()

# explore folder contents and count files with optional extension
def explore_directory(dir_path, extension=None):
    print(f"Exploring directory: {dir_path}\n")
    if not os.path.exists(dir_path):
        print("Path does not exist.")
        return

    subdirs = [d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))]
    files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f))]

    print(f"Subfolders: {len(subdirs)}")
    for d in subdirs:
        print("  └──", d)

    if extension:
        matching = [f for f in files if f.endswith(extension)]
        print(f"\n .{extension} files: {len(matching)}")
    else:
        print(f"\n Total files: {len(files)}")

# check contents of all relevant data folders
explore_directory("/kaggle/input/crohme2019/crohme2019/crohme2019/test/", ".inkml")
explore_directory("/kaggle/input/crohme2019/crohme2019/crohme2019/train/", ".inkml")
explore_directory("/kaggle/input/crohme2019/crohme2019/crohme2019/valid/", ".inkml")


Exploring directory: /kaggle/input/crohme2019/crohme2019/crohme2019/test/

Subfolders: 0

 ..inkml files: 1199
Exploring directory: /kaggle/input/crohme2019/crohme2019/crohme2019/train/

Subfolders: 0

 ..inkml files: 8901
Exploring directory: /kaggle/input/crohme2019/crohme2019/crohme2019/valid/

Subfolders: 0

 ..inkml files: 986


In [None]:
# parsing label file and saving structured labels to CSV
import pandas as pd

# this is the path to the ground-truth label file
label_file_path = "/kaggle/input/crohme2019/crohme2019_train.txt"

# list to hold parsed (file_path, label_text) pairs
label_entries = []

# read each line from the label file and split into components
with open(label_file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # skip empty lines
            parts = line.split(maxsplit=1)  # split into path and label
            if len(parts) == 2:
                label_entries.append(parts)

# convert parsed data into a DataFrame
df = pd.DataFrame(label_entries, columns=["file_path", "label"])

# extract just the file name from the path (for easier matching later)
df["file_name"] = df["file_path"].apply(lambda x: x.split("/")[-1])

# tokenize label string into list of tokens
df["tokens"] = df["label"].str.split()

# export the DataFrame to a CSV file for later use
df.to_csv("/kaggle/working/train_labels.csv", index=False)
print(" Label data saved to train_labels.csv")

Label data saved to train_labels.csv


In [None]:
# parsing label file and saving structured labels to CSV
import pandas as pd

# this is the path to the ground-truth label file for test set
label_file_path = "/content/crohme2019_test.txt"

# list to hold parsed (file_path, label_text) pairs
label_entries = []

# read each line from the label file and split into components
with open(label_file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # skip empty lines
            parts = line.split(maxsplit=1)  # split into path and label
            if len(parts) == 2:
                label_entries.append(parts)

# convert parsed data into a DataFrame
df = pd.DataFrame(label_entries, columns=["file_path", "label"])

# extract just the file name from the path (for easier matching later)
df["file_name"] = df["file_path"].apply(lambda x: x.split("/")[-1])

# tokenize label string into list of tokens
df["tokens"] = df["label"].str.split()

# export the DataFrame to a CSV file for later use
df.to_csv("test_labels.csv", index=False)
print(" Label data saved to test_labels.csv")

Label data saved to test_labels.csv


In [None]:
# parsing label file and saving structured labels to CSV
import pandas as pd

# this is the path to the ground-truth label file for validation set
label_file_path = "/content/crohme2019_valid.txt"

# list to hold parsed (file_path, label_text) pairs
label_entries = []

# read each line from the label file and split into components
with open(label_file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # skip empty lines
            parts = line.split(maxsplit=1)  # split into path and label
            if len(parts) == 2:
                label_entries.append(parts)

# convert parsed data into a DataFrame
df = pd.DataFrame(label_entries, columns=["file_path", "label"])

# extract just the file name from the path (for easier matching later)
df["file_name"] = df["file_path"].apply(lambda x: x.split("/")[-1])

# tokenize label string into list of tokens
df["tokens"] = df["label"].str.split()

# export the DataFrame to a CSV file for later use
df.to_csv("valid_labels.csv", index=False)
print(" Label data saved to valid_labels.csv")

Label data saved to valid_labels.csv


In [3]:
import pandas as pd
from collections import Counter
import json

# load the CSV file containing training labels and their tokenized math expressions
train_df = pd.read_csv("/content/train_labels.csv")

# in case the token column doesn't exist, generate it from the label string
if "tokens" not in train_df.columns:
    train_df["tokens"] = train_df["label"].astype(str).str.split()

# flatten all tokens across all samples into a single list
# this allows us to find all unique tokens used in the training data
all_tokens = [token for tokens in train_df["tokens"] for token in tokens if isinstance(token, str)]

# count frequency of each token (can be useful for analysis)
token_counts = Counter(all_tokens)

# define special tokens used by the model (padding, start/end, unknown)
special_tokens = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]

# collect all unique tokens from the dataset and sort them alphabetically
unique_tokens = sorted(set(all_tokens))

# construct the final vocabulary list: special tokens first, then all unique tokens
vocab = special_tokens + unique_tokens

# create a dictionary that maps each token to a unique integer ID
token_to_id = {token: idx for idx, token in enumerate(vocab)}

# create the reverse mapping: ID back to token (used during decoding)
id_to_token = {idx: token for token, idx in token_to_id.items()}

# export the token-to-ID mapping to a JSON file for later use during training
with open("token_to_id.json", "w", encoding="utf-8") as f:
    json.dump(token_to_id, f, indent=2, ensure_ascii=False)

# also export the reverse mapping (optional but useful for evaluation/inference)
with open("id_to_token.json", "w", encoding="utf-8") as f:
    json.dump(id_to_token, f, indent=2, ensure_ascii=False)

# confirmation message
print(f" Vocabulary saved! Total tokens: {len(vocab)}")


 Vocabulary saved! Total tokens: 77
