In [None]:

import requests
import re
from pathlib import Path
from datasets import load_dataset
import tokenizers
import json
import pandas as pd
from typstscribe.const import TOKENIZER_MODELS_DIR, TOKENS_DIR

In [None]:
# Target size of the BPE vocabulary (passed to the trainer as `vocab_size`).
VOCAB_SIZE = 4096

# Typst syntax resources live here: the manual symbol lists are read from this
# directory and the raw tokenizer JSON is written below it.
TYPST_SYNTAX_DIR = TOKENS_DIR / "typst_syntax"

# Make sure the output directory for the wrapped tokenizer exists.
TOKENIZER_MODELS_DIR.mkdir(exist_ok=True, parents=True)

In [63]:
# Get codex sym tokens.
# The URL is pinned to a specific commit so the downloaded symbol table does not
# change between runs.
CODEX_SYM_TXT_URL = "https://raw.githubusercontent.com/typst/codex/21b1cd06f61bc90bae32780297f82282c759ccc9/src/modules/sym.txt"
# A timeout keeps the cell from hanging indefinitely if the host does not answer.
response = requests.get(CODEX_SYM_TXT_URL, timeout=30)

# Raises requests.HTTPError (with status code and URL) on a 4xx/5xx response,
# instead of a bare AssertionError that carries no diagnostics.
response.raise_for_status()

In [64]:
def parse_symbol_file(text):
    """
    Parses a symbol definition file and returns a list of dot-notation identifiers.

    The file is processed line by line. After `//` comments are stripped, the
    first whitespace-separated token of a line is its key and anything after it
    on the same line is its value:

    * ``name value``      -> root symbol ``name``; also becomes the current root.
    * ``name``            -> namespace only (no symbol of its own); becomes the
                             current root for the variants that follow.
    * ``.variant value``  -> symbol ``<current root>.variant``.

    Lines whose first token is not an identifier (for example an annotation
    line such as ``@deprecated: ...``) are skipped as a whole, so words in free
    text are never mistaken for symbols and never replace the current root.
    Likewise a value that happens to consist of letters is not treated as a key.

    Args:
        text (str): The raw content of the symbol file.

    Returns:
        list[str]: A list of identifier strings (e.g., 'space.nobreak'), in
        file order.
    """
    # A key consists only of letters and dots and contains at least one letter.
    # This rejects values such as "..." and purely numeric/symbolic tokens.
    key_pattern = re.compile(r'^(?=.*[a-zA-Z])[\.a-zA-Z]+$')

    results = []
    current_root = None

    for raw_line in text.splitlines():
        # Drop the comment part, then split what is left into key + value tokens.
        fields = raw_line.split('//')[0].split()
        if not fields:
            continue

        key = fields[0]
        if not key_pattern.match(key):
            # Not a symbol line (annotation, stray value, garbage): ignore it.
            continue

        if key.startswith('.'):
            # Variant of the most recent root; dropped if no root was seen yet.
            if current_root:
                results.append(f"{current_root}{key}")
        else:
            current_root = key
            # A root is itself a symbol only when a value follows on its line;
            # otherwise it merely namespaces its variants (e.g. `paren`).
            if len(fields) > 1:
                results.append(key)

    return results

# Identifiers extracted from the downloaded codex `sym.txt` (e.g. 'space.nobreak').
codex_sym_symbols = parse_symbol_file(response.text)

In [65]:
# Now parse manual symbols
def parse_manual_file(text):
    """
    Parses a manual symbol definition file and returns a list of identifiers.

    Each non-empty line that is not a `//` comment contributes one identifier.
    Surrounding whitespace is stripped first, so indented comments are skipped
    and stray trailing spaces do not end up inside a token. Duplicates are
    removed while keeping the order of first occurrence, which makes the result
    deterministic.

    Args:
        text (str): The raw content of the manual symbol file.

    Returns:
        list[str]: A list of unique identifier strings
        (e.g., 'math.styles.upright'), in file order.
    """
    # A dict is used as an insertion-ordered set.
    symbols = {}
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("//"):
            continue
        symbols.setdefault(line, None)
    return list(symbols)

manual_symbols = []
# glob() yields paths in a filesystem-dependent order; sorting makes the
# collected list reproducible across machines and runs.
for filename in sorted(TYPST_SYNTAX_DIR.glob("manual/*.txt")):
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    file_content = filename.read_text(encoding="utf-8")
    manual_symbols.extend(parse_manual_file(file_content))



In [66]:
# Deduplicate, then sort. Iterating a set of strings has no stable order across
# interpreter runs (string hashing is randomised), and this list is later passed
# to the trainer as `special_tokens`, so an unstable order would make the
# resulting vocabulary irreproducible.
raw_typst_tokens = sorted(set(codex_sym_symbols + manual_symbols))
typst_tokens = []

# Add versions with space prepended for efficiency
# NOTE(review): in the sample tokenizations further down, 'zeta' follows a
# separate piece ending in 'Ġ' instead of appearing as 'Ġzeta', so these
# prefixed variants may never be matched — confirm.
UNICODE_SPACE_CHAR = "\u0120"  # Ġ
for token in raw_typst_tokens:
    typst_tokens.append(token)
    typst_tokens.append(UNICODE_SPACE_CHAR + token)

print(len(typst_tokens))


2628


In [67]:
# Get dataset for tokenizer
# Number of Typst source strings used for training; named so it can be tuned in
# one place instead of being buried in the slice below.
NUM_TOKENIZER_SAMPLES = 100_000
ds = load_dataset(
    "JeppeKlitgaard/typst-image-dataset",
    data_files="metadata.parquet",
)["train"]["typst"][:NUM_TOKENIZER_SAMPLES]

In [68]:
# Special tokens needed by the model architecture; the names and spellings are
# the ones RoBERTa uses. Insertion order matters: the values are placed, in this
# order, at the front of the token list handed to the trainer.
ARCHITECTURE_TOKENS = dict(
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    mask_token="<mask>",
)

In [69]:
# Full list handed to the trainer as special tokens: architecture tokens first,
# followed by every Typst token (plain and space-prefixed variants).
all_tokens = list(ARCHITECTURE_TOKENS.values()) + typst_tokens

In [70]:
# Used by both the pre-tokenizer here and the post-processor further down, so
# the two stay in agreement about inserting a leading space.
ADD_PREFIX_SPACE = False

# BPE model with the architecture's <unk> token as its unknown token.
bpe_model = tokenizers.models.BPE(unk_token=ARCHITECTURE_TOKENS["unk_token"])
tokenizer = tokenizers.Tokenizer(bpe_model)

# Pre-tokenization, applied in this order:
#   1. ByteLevel encoding (its regex splitting switched off) so that every
#      character is representable and no <unk> tokens are produced.
#   2. Digits split into isolated tokens (e.g., "2024" -> "2", "0", "2", "4").
byte_level_step = tokenizers.pre_tokenizers.ByteLevel(
    add_prefix_space=ADD_PREFIX_SPACE,
    use_regex=False,
)
digit_step = tokenizers.pre_tokenizers.Digits(individual_digits=True)
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([byte_level_step, digit_step])

# Number of samples handed to the trainer per batch.
BATCH_SIZE = 10_000


def batch_iterator():
    """Yield consecutive slices of `ds`, each holding at most BATCH_SIZE samples."""
    total = len(ds)
    start = 0
    while start < total:
        yield ds[start : start + BATCH_SIZE]
        start += BATCH_SIZE


# Train tokeniser.
# `special_tokens` carries the architecture and Typst tokens; `initial_alphabet`
# is the ByteLevel alphabet.
trainer = tokenizers.trainers.BpeTrainer(
    show_progress=True,
    initial_alphabet=tokenizers.pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=all_tokens,
    vocab_size=VOCAB_SIZE,
)

# `length` is the total number of samples (used by the progress display).
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(ds))

# Disabled: registering brackets as standalone added tokens (code kept below for
# reference) breaks with current Typst math syntax, where f_a(x) ≠ f_a (x)
# bracket_like = ["(", "[", "{", ")", "]", "}"]
# for bracket in bracket_like:
#     tokenizer.add_tokens([
#         tokenizers.AddedToken(
#             content=bracket,
#             single_word=False,
#             special=False,
#             # lstrip=True,
#             # rstrip=True,
#         )
#     ])

# Decoder: the ByteLevel counterpart of the ByteLevel pre-tokenizer.
tokenizer.decoder = tokenizers.decoders.ByteLevel()

# Look up the ids the trained vocabulary assigned to <s> and </s>.
bos_id = tokenizer.token_to_id(ARCHITECTURE_TOKENS["bos_token"])
eos_id = tokenizer.token_to_id(ARCHITECTURE_TOKENS["eos_token"])

# RobertaProcessing takes two (token, id) pairs: the separator first, then the
# classifier token. Here </s> is the separator and <s> the classifier token.
sep_pair = (ARCHITECTURE_TOKENS["eos_token"], eos_id)
cls_pair = (ARCHITECTURE_TOKENS["bos_token"], bos_id)

tokenizer.post_processor = tokenizers.processors.RobertaProcessing(
    sep_pair,
    cls_pair,
    trim_offsets=True,
    add_prefix_space=ADD_PREFIX_SPACE,
)







In [71]:
# Sanity check: show which normalizer, if any, is attached to the tokenizer.
print("Normalizer:", tokenizer.normalizer)

Normalizer: None


In [72]:
# Serialise the tokenizer to JSON and parse it, so the flags of its added tokens
# can be edited as plain data.
tokenizer_json = tokenizer.to_str()
config = json.loads(tokenizer_json)

# Set membership keeps the per-entry check cheap.
typst_set = set(typst_tokens)

# The Typst tokens were handed to the trainer as special tokens. Mark them as
# non-special, single-word added tokens; every other entry is left untouched.
for added_token in config.get("added_tokens", []):
    if added_token["content"] not in typst_set:
        continue
    added_token["single_word"] = True
    added_token["special"] = False

# Rebuild the tokenizer from the edited configuration.
tokenizer = tokenizers.Tokenizer.from_str(json.dumps(config))

In [73]:
# Save final tokenizer
TOKENIZER_DIR = TYPST_SYNTAX_DIR / "tokenizer"
# parents=True so that a missing TYPST_SYNTAX_DIR does not abort the save; this
# matches how TOKENIZER_MODELS_DIR is created near the top of the notebook.
TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(TOKENIZER_DIR / "typst_tokenizer.json"))

In [74]:
# Round-trip the first five samples through the raw tokenizer and print the
# source text, its token ids and the decoded text.
for idx in range(5):
    example = ds[idx]
    encoded = tokenizer.encode(example)
    decoded = tokenizer.decode(encoded.ids, skip_special_tokens=True)
    report = (
        ("Original: ", example),
        ("Encoded:  ", encoded.ids),
        ("Decoded:  ", decoded),
    )
    for label, value in report:
        print(label, value)
    print()

Original:  Lambda _ ( W ) ^ ( ( 0 ) ) ( zeta ; r ) = 1 , wide Lambda _ ( W ) ^ ( ( 1 ) ) ( zeta ; r ) = Lambda ( zeta ; r ) = ( Theta _ ( q ^ ( 4 ) ) ( r zeta ^ ( 2 ) ) Theta _ ( q ^ ( 4 ) ) ( q ^ ( 2 ) r zeta ^ ( - 2 ) ) ) / ( Theta _ ( q ^ ( 4 ) ) ( r zeta ^ ( - 2 ) ) Theta _ ( q ^ ( 4 ) ) ( q ^ ( 2 ) r zeta ^ ( 2 ) ) ) zeta ^ ( - 2 ) ,
Encoded:   [1, 691, 2864, 2662, 3044, 1009, 2899, 1035, 2823, 921, 2999, 2907, 681, 2823, 1579, 3586, 2823, 691, 2864, 2662, 3044, 681, 2899, 1035, 2823, 921, 2999, 2907, 691, 2861, 1035, 2823, 921, 2999, 2985, 2107, 2864, 3273, 1835, 2899, 2687, 2823, 1035, 2866, 2395, 2871, 2107, 2864, 3273, 1835, 2899, 3273, 2395, 3178, 2823, 1035, 2866, 707, 2823, 2395, 3074, 2107, 2864, 3273, 1835, 2899, 2687, 2823, 1035, 2866, 707, 2823, 2395, 2871, 2107, 2864, 3273, 1835, 2899, 3273, 2395, 3178, 2823, 1035, 2866, 2395, 2991, 1035, 2866, 707, 2823, 2395, 2862, 1579, 2]
Decoded:   Lambda _ ( W ) ^ ( ( 0 ) ) ( zeta ; r ) = 1 , wide Lambda _ ( W ) ^ ( ( 1 ) ) ( zet

In [75]:

from transformers import PreTrainedTokenizerFast

# Wrap the raw tokenizer so it can be used through the transformers API.
# cls/sep reuse the bos/eos tokens, mirroring the RobertaProcessing
# post-processor configured on the raw tokenizer.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token=ARCHITECTURE_TOKENS["bos_token"],
    eos_token=ARCHITECTURE_TOKENS["eos_token"],
    unk_token=ARCHITECTURE_TOKENS["unk_token"],
    pad_token=ARCHITECTURE_TOKENS["pad_token"],
    mask_token=ARCHITECTURE_TOKENS["mask_token"],
    cls_token=ARCHITECTURE_TOKENS["bos_token"],
    sep_token=ARCHITECTURE_TOKENS["eos_token"],
    # Must stay False: the clean-up step is meant for English prose and removes
    # spaces before punctuation and around apostrophes when decoding (" ," -> ",",
    # " ' " -> "'"). Typst markup is whitespace-sensitive (f_a(x) ≠ f_a (x)), so
    # decoded text has to reproduce the original spacing exactly.
    clean_up_tokenization_spaces=False
)


In [76]:
# Spot check on a snippet with nested calls and a quoted string; the cell's
# output is the list of token strings.
SAMPLE_TEXT = 'underbrace(f(theta), "obj")'

wrapped_tokenizer.tokenize(SAMPLE_TEXT)

['underbrace', '(', 'f', '(', 'theta', ')', ',', 'Ġ"', 'o', 'b', 'j', '"', ')']

In [79]:
# Same round trip as before for the first five samples, additionally showing the
# token strings produced by the wrapped tokenizer.
for idx in range(5):
    example = ds[idx]
    encoded = tokenizer.encode(example)
    tokenized = wrapped_tokenizer.tokenize(example)
    decoded = tokenizer.decode(encoded.ids, skip_special_tokens=True)
    report = (
        ("Original: ", example),
        ("Encoded:  ", encoded.ids),
        ("Tokenized: ", tokenized),
        ("Decoded:  ", decoded),
    )
    for label, value in report:
        print(label, value)
    print()

Original:  Lambda _ ( W ) ^ ( ( 0 ) ) ( zeta ; r ) = 1 , wide Lambda _ ( W ) ^ ( ( 1 ) ) ( zeta ; r ) = Lambda ( zeta ; r ) = ( Theta _ ( q ^ ( 4 ) ) ( r zeta ^ ( 2 ) ) Theta _ ( q ^ ( 4 ) ) ( q ^ ( 2 ) r zeta ^ ( - 2 ) ) ) / ( Theta _ ( q ^ ( 4 ) ) ( r zeta ^ ( - 2 ) ) Theta _ ( q ^ ( 4 ) ) ( q ^ ( 2 ) r zeta ^ ( 2 ) ) ) zeta ^ ( - 2 ) ,
Encoded:   [1, 691, 2864, 2662, 3044, 1009, 2899, 1035, 2823, 921, 2999, 2907, 681, 2823, 1579, 3586, 2823, 691, 2864, 2662, 3044, 681, 2899, 1035, 2823, 921, 2999, 2907, 691, 2861, 1035, 2823, 921, 2999, 2985, 2107, 2864, 3273, 1835, 2899, 2687, 2823, 1035, 2866, 2395, 2871, 2107, 2864, 3273, 1835, 2899, 3273, 2395, 3178, 2823, 1035, 2866, 707, 2823, 2395, 3074, 2107, 2864, 3273, 1835, 2899, 2687, 2823, 1035, 2866, 707, 2823, 2395, 2871, 2107, 2864, 3273, 1835, 2899, 3273, 2395, 3178, 2823, 1035, 2866, 2395, 2991, 1035, 2866, 707, 2823, 2395, 2862, 1579, 2]
Tokenized:  ['Lambda', 'Ġ_Ġ(Ġ', 'W', 'Ġ)Ġ^Ġ(Ġ(Ġ', '0', 'Ġ)Ġ)Ġ(Ġ', 'zeta', 'Ġ', ';', 'Ġr', 'Ġ)Ġ

In [78]:
# Persist the wrapped tokenizer; the output below lists the files written
# (tokenizer_config.json, special_tokens_map.json, tokenizer.json).
wrapped_tokenizer.save_pretrained(TOKENIZER_MODELS_DIR / "typst_tokenizer")

('/home/jkl/Code/DTU-02456-Deep-Learning-Project/models/tokenizers/typst_tokenizer/tokenizer_config.json',
 '/home/jkl/Code/DTU-02456-Deep-Learning-Project/models/tokenizers/typst_tokenizer/special_tokens_map.json',
 '/home/jkl/Code/DTU-02456-Deep-Learning-Project/models/tokenizers/typst_tokenizer/tokenizer.json')

In [None]:
# wrapped_tokenizer.push_to_hub("JeppeKlitgaard/typst-tokenizer")