In [1]:
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
from transformers import GPT2TokenizerFast
from tqdm import tqdm
from datasets import load_dataset, Value, Dataset, Sequence
import os
import sys
from huggingface_hub import login, HfApi


In [2]:

# 1. Load the tokenizer
# The "Fast" GPT-2 tokenizer is used for better throughput on large row counts.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
print(f"Number of distinct tokens in tokenizer: {tokenizer.vocab_size} ")
# Token ids are stored as uint16 when the dataset is built, so every id must fit in 16 bits.
assert tokenizer.vocab_size < 2**16, "Tokenizer vocab size exceeds uint16 capacity."

# Number of tokens kept per output row.
TARGET_TOKEN_LENGTH = 8192
# Number of rows to collect. 100 is a small trial size; use 1_000_000 for the full run.
TARGET_ROWS = 100
HF_USERNAME = "MikiV"

# Fail fast on missing/invalid Hugging Face credentials *before* the long
# streaming/tokenization pass, instead of discovering the problem at upload time.
# `whoami()` uses the HF_TOKEN environment variable or the locally cached login
# and raises if no valid token is available.
# NOTE(review): this confirms the token is valid, not that it has write scope.
HfApi().whoami()

INPUT_DATASET_ID = "openbmb/Ultra-FineWeb"
# Output repo id: "<user>/<input dataset name>-chunked-<tokens per row>".
OUTPUT_DATASET_NAME = f"{HF_USERNAME}/{INPUT_DATASET_ID.split('/')[-1]}-chunked-{TARGET_TOKEN_LENGTH}"
# Size estimate: rows * tokens per row * 2 bytes (uint16), reported in GB (1e9 bytes).
print(f"Final dataset will have approximately {TARGET_ROWS * TARGET_TOKEN_LENGTH * 2 / 1e9}GB.")

Number of distinct tokens in tokenizer: 50257 
Final dataset will have approximately 0.0016384GB.


In [3]:

# Suppress tokenization warnings (e.g., token indices sequence length is longer than the specified maximum).
# We want the true document length, not one limited to the model's 1024-token context.
# `model_max_length` is an integer setting, so use an int rather than the float literal 1e9.
tokenizer.model_max_length = int(1e9)

# 2. Load the dataset in streaming mode
# "openbmb/Ultra-FineWeb" is massive, so streaming=True avoids downloading it up front.
# We explicitly select the "en" split.
streaming_dataset = load_dataset(INPUT_DATASET_ID, split="en", streaming=True)
output_rows = []

# The context manager guarantees the progress bar is closed even if the loop
# exits early or raises.
with tqdm(total=TARGET_ROWS, desc="Collecting Valid Samples", unit="rows") as pbar:
    for row in streaming_dataset:
        text = row["content"]
        input_ids = tokenizer(text)["input_ids"]
        # Keep only documents with at least TARGET_TOKEN_LENGTH tokens, truncated
        # to exactly that many; shorter documents are skipped.
        if len(input_ids) >= TARGET_TOKEN_LENGTH:
            # Store a compact uint16 array instead of a Python list of ints:
            # far less memory per row, which matters at the 1M-row target.
            # Safe only while every token id is below 2**16.
            token_array = np.asarray(input_ids[:TARGET_TOKEN_LENGTH], dtype=np.uint16)
            output_rows.append({"input_ids": token_array})
            pbar.update(1)
        if len(output_rows) >= TARGET_ROWS:
            break

# The stream can run out before enough long documents were found; make that visible
# instead of silently producing a smaller dataset.
if len(output_rows) < TARGET_ROWS:
    print(f"Warning: stream ended after collecting only {len(output_rows)} of {TARGET_ROWS} rows.")

Resolving data files:   0%|          | 0/2048 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/256 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2048 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/256 [00:00<?, ?it/s]

Collecting Valid Samples: 100%|██████████| 100/100 [01:33<00:00,  6.68rows/s]

In [4]:
# Fixed-length uint16 sequence: every row holds exactly TARGET_TOKEN_LENGTH token ids.
token_feature = Sequence(Value("uint16"), length=TARGET_TOKEN_LENGTH)

# Build the in-memory dataset from the collected rows and cast the token column
# to the compact fixed-length type before uploading.
output_ds = Dataset.from_list(output_rows).cast_column("input_ids", token_feature)

# Publish as a public dataset repo; the returned CommitInfo is the cell's output.
output_ds.push_to_hub(OUTPUT_DATASET_NAME, private=False)

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/294 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/MikiV/Ultra-FineWeb-chunked-8192/commit/ec7894057237ed1ef247851d40c8ed9bb534db69', commit_message='Upload dataset', commit_description='', oid='ec7894057237ed1ef247851d40c8ed9bb534db69', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MikiV/Ultra-FineWeb-chunked-8192', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MikiV/Ultra-FineWeb-chunked-8192'), pr_revision=None, pr_num=None)