In [1]:
from pathlib import Path
from huggingface_hub import hf_hub_download
import pandas as pd
import rarfile
import webdataset as wds
from tqdm.auto import tqdm
import json

In [2]:
HUANG_DATASET_REPO_ID = "hoang-quoc-trung/fusion-image-to-latex-datasets"

THIS_FOLDER = Path().resolve()
DATA_FOLDER = THIS_FOLDER / "data"
DATA_FOLDER.mkdir(exist_ok=True, parents=True)
TEST_FOLDER = DATA_FOLDER / "test"
TEST_FOLDER.mkdir(exist_ok=True, parents=True)
TRAIN_FOLDER = DATA_FOLDER / "train"
TRAIN_FOLDER.mkdir(exist_ok=True, parents=True)
VALIDATION_FOLDER = DATA_FOLDER / "validation"
VALIDATION_FOLDER.mkdir(exist_ok=True, parents=True)

In [3]:
# Configure rarfile
rarfile.UNRAR_TOOL = str(THIS_FOLDER / "UnRAR.exe")


In [4]:
# Download images
images_rar_path = Path(hf_hub_download(
    repo_id=HUANG_DATASET_REPO_ID,
    filename="root.rar",
    repo_type="dataset"
)).resolve()

assert images_rar_path.exists()
images_rar_path


WindowsPath('C:/Users/jeppe/.cache/huggingface/hub/datasets--hoang-quoc-trung--fusion-image-to-latex-datasets/blobs/afe2298da7eb1bc4410818ca4386331f8c354978f8e4b6fb62a850c0b8e28ed2')

In [5]:
# Load metadata
df = pd.read_parquet(DATA_FOLDER / "metadata.parquet")
df = df.set_index("image_filename")

In [None]:
rf = rarfile.RarFile(images_rar_path)

In [None]:
# Debug: Dump filenames
filenames = rf.namelist()
with open(DATA_FOLDER / "filenames.txt", "w") as f:
    for filename in filenames:
        f.write(f"{filename}\n")

In [None]:
# Since random access is horrible in RAR files, we let the RAR file dictate the order of the images
shard_writer_args = {
    "maxcount": 10_000,
    "maxsize": 1e9,
}

skip_count = 0

with (
    # rarfile.RarFile(images_rar_path) as rf,
    wds.ShardWriter("data/test/shard_%06d.tar", **shard_writer_args) as sink_test,
    wds.ShardWriter("data/train/shard_%06d.tar", **shard_writer_args) as sink_train,
    wds.ShardWriter("data/validation/shard_%06d.tar", **shard_writer_args) as sink_val,

):
    infolist = rf.infolist()

    for info in tqdm(infolist, desc="Processing images"):
        filename = info.filename

        # Fix: corrupted filenames from original dataset
        filename = filename.removeprefix("images/")

        if filename != "0000b55567e8c74_basic.png":
            filename = filename.removeprefix("0000b55567e8c74_basic.png")

        if filename == "/":
            continue

        key = Path(filename).stem
        image_extension = Path(filename).suffix.lstrip(".").lower()

        if filename not in df.index:
            skip_count += 1
            print(f"Skipping {filename} as it is not in the metadata (skip: {skip_count})")
            continue

        image_data = rf.read(info)
        metadata_entry = df.loc[filename]

        metadata = {
            "image_type": metadata_entry["image_type"],
        }

        sample = {
            "__key__": key,
            f"image.{image_extension}": image_data,
            "latex.txt": metadata_entry["latex"],
            "typst.txt": metadata_entry["typst"],
            "metadata.json": json.dumps(metadata).encode("utf-8"),
        }

        if metadata_entry["split"] == "train":
            sink_train.write(sample)
        elif metadata_entry["split"] == "validation":
            sink_val.write(sample)
        elif metadata_entry["split"] == "test":
            sink_test.write(sample)
        else:
            print(f"Unknown split for {filename}: {metadata_entry['split']}")
            continue


# writing data/test/shard_000000.tar 0 0.0 GB 0
# writing data/train/shard_000000.tar 0 0.0 GB 0
# writing data/validation/shard_000000.tar 0 0.0 GB 0


Processing images:   0%|          | 0/3669584 [00:00<?, ?it/s]

KeyError: 'latex'