In [1]:
from convert import latex_to_typst_paranexus_tex2typ_batch
from huggingface_hub import hf_hub_download
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
from common import HUANG_DATASET_REPO_ID, RAW_METADATA_PARQUET_FILE_PATH

TODO: Maybe integrate `typstyle` formatting, though I don't think it does much in math mode, so it is safe to omit for now.

In [2]:
# Create the folder that will hold the metadata parquet file up front, so the
# save step at the end of the notebook does not fail on a missing directory.
output_dir = RAW_METADATA_PARQUET_FILE_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)

# Step 1: Metadata

In [3]:
# One entry per metadata CSV in the dataset repository. `split` and
# `image_type` are not read from the CSVs; they are attached to every row of
# the corresponding file below.
METADATA_FILES = [
    {"archive_path": "train/handwritten_mathematical_expressions_train.csv", "split": "train", "image_type": "handwritten"},
    {"archive_path": "train/printed_mathematical_expressions_train.csv", "split": "train", "image_type": "printed"},
    {"archive_path": "test/crohme_handwritten_2014_test.csv", "split": "test", "image_type": "handwritten"},
    {"archive_path": "test/crohme_handwritten_2016_test.csv", "split": "test", "image_type": "handwritten"},
    {"archive_path": "test/crohme_handwritten_2019_test.csv", "split": "test", "image_type": "handwritten"},
    {"archive_path": "test/img2latex_100k_handwritten_test.csv", "split": "test", "image_type": "handwritten"},
    {"archive_path": "test/img2latex_100k_test.csv", "split": "test", "image_type": "printed"},
    {"archive_path": "val/handwritten_mathematical_expressions_val.csv", "split": "validation", "image_type": "handwritten"},
    {"archive_path": "val/printed_mathematical_expressions_val.csv", "split": "validation", "image_type": "printed"},
]

# Download each CSV from the Hugging Face dataset repo, tag its rows with the
# split / image type declared above, and collect the per-file frames.
dfs = []
for entry in METADATA_FILES:
    downloaded = hf_hub_download(
        repo_id=HUANG_DATASET_REPO_ID,
        filename=entry["archive_path"],
        repo_type="dataset",
    )
    local_csv = Path(downloaded).resolve()

    # Sanity check that the returned path actually points at a file on disk.
    assert local_csv.exists()

    frame = pd.read_csv(local_csv)
    frame["split"] = entry["split"]
    frame["image_type"] = entry["image_type"]
    dfs.append(frame)

# Stack everything into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
df.head(5)

Unnamed: 0,image_filename,latex,split,image_type
0,39928.png,\Lambda _ { W } ^ { ( 0 ) } ( \zeta ; r ) = 1 ...,train,handwritten
1,54368.png,{ \hat { \delta } } { { \overline { { \eta } }...,train,handwritten
2,83904690-93db-42f1-81d0-d60cb7981cf9.jpg,\operatorname* { l i m } _ { w \to 9 ^ { + } }...,train,handwritten
3,132b8135-e8e6-45d4-a2dc-2596f9e95e84.jpg,\operatorname* { l i m } _ { s \to \infty } \f...,train,handwritten
4,41328712-67d5-4692-8577-03077eff4ea7.jpg,\operatorname* { l i m } _ { v \to 2 } \frac {...,train,handwritten


In [4]:
# Convert to Typst
#
# The `latex` column is sent through the converter in fixed-size batches and
# the results are stored in a new `typst` column, row-aligned with `df`.
# Results that start with 'CONVERT_ERROR:' are counted as failed conversions.
# NOTE(review): assumes the converter returns one string per input, in input
# order -- the length check below enforces the count; confirm the ordering.
batch_size = 1000
typst_results = []
# Process in batches with progress bar
for i in tqdm(range(0, len(df), batch_size), desc="Converting LaTeX to Typst"):
    batch_end = min(i + batch_size, len(df))
    batch_latex = df.iloc[i:batch_end]["latex"].tolist()

    # Convert batch (materialised so its length can be checked)
    batch_typst = list(latex_to_typst_paranexus_tex2typ_batch(batch_latex))

    # Fail fast on a misaligned batch instead of discovering the mismatch
    # only at the column assignment after the whole run has finished.
    if len(batch_typst) != len(batch_latex):
        raise ValueError(
            f"Converter returned {len(batch_typst)} results for "
            f"{len(batch_latex)} inputs (rows {i}:{batch_end})"
        )
    typst_results.extend(batch_typst)

# Add as new column
df["typst"] = typst_results

# Print statistics
total = len(typst_results)
failed = sum(1 for x in typst_results if x.startswith('CONVERT_ERROR:'))
successful = total - failed
# Avoid ZeroDivisionError when the frame is empty; percentages then read 0.0%.
denominator = max(total, 1)

print("\nConversion complete:")
print(f"  Total: {total}")
print(f"  Successful: {successful} ({100*successful/denominator:.1f}%)")
print(f"  Failed: {failed} ({100*failed/denominator:.1f}%)")


Converting LaTeX to Typst:   0%|          | 0/3464 [00:00<?, ?it/s]


Conversion complete:
  Total: 3463140
  Successful: 3326329 (96.0%)
  Failed: 136811 (4.0%)


In [5]:
# Drop every row whose Typst output carries the converter's error prefix,
# then renumber the remaining rows from zero.
is_failed = df["typst"].str.startswith('CONVERT_ERROR:')
df = df.loc[~is_failed].reset_index(drop=True)

# Persist the cleaned metadata (the index is not written to the file).
df.to_parquet(RAW_METADATA_PARQUET_FILE_PATH, index=False)

In [6]:
# Show the filtered frame as the cell output (the display is truncated to the
# first and last rows) for a quick visual check of the new `typst` column.
df

Unnamed: 0,image_filename,latex,split,image_type,typst
0,39928.png,\Lambda _ { W } ^ { ( 0 ) } ( \zeta ; r ) = 1 ...,train,handwritten,"Lambda _ ( W ) ^ ( ( 0 ) ) ( zeta ; r ) = 1 , ..."
1,54368.png,{ \hat { \delta } } { { \overline { { \eta } }...,train,handwritten,hat( delta ) overline( eta ) ^ ( a ) = - parti...
2,83904690-93db-42f1-81d0-d60cb7981cf9.jpg,\operatorname* { l i m } _ { w \to 9 ^ { + } }...,train,handwritten,lim _ ( w -> 9 ^ ( + ) ) ( 5 + - tan ^ ( 0 ) w...
3,132b8135-e8e6-45d4-a2dc-2596f9e95e84.jpg,\operatorname* { l i m } _ { s \to \infty } \f...,train,handwritten,lim _ ( s -> oo ) ( 92 ln s ) / ( sqrt( x ) )
4,41328712-67d5-4692-8577-03077eff4ea7.jpg,\operatorname* { l i m } _ { v \to 2 } \frac {...,train,handwritten,lim _ ( v -> 2 ) ( 9 + tan v ) / ( 8 + - 5 cos...
...,...,...,...,...,...
3326324,d804ed875abf3ef_basic.png,\mathcal { D } [ u ] ^ { * } = \mathcal { D } ...,validation,printed,cal( D ) [ u ] ^ ( * ) = cal( D ) [ u ]
3326325,9923cc22d8870d6_basic.png,\frac { 1 } { 2 } \left( 1 - \sqrt { 1 - \sin ...,validation,printed,( 1 ) / ( 2 ) ( 1 - sqrt( 1 - sin ^ ( 2 ) 2 th...
3326326,sume_data-00005-of-00009_65498.png,p _ { 1 } = P _ { 1 } \frac { \eta _ { s } } {...,validation,printed,p _ ( 1 ) = P _ ( 1 ) ( eta _ ( s ) ) / ( 2 )
3326327,4d7a979f491f0ad.png,S _ { \mathrm { k i n e t i c } } = \frac { i ...,validation,printed,"S _ ( ""kinetic"" ) = ( i ) / ( 2 ) integral d ^..."
