In [1]:
from datasets import load_dataset, Dataset, concatenate_datasets

# Normalization function with meta source info
def normalize_to_text_with_meta(dataset, text_fields, source_name):
    """
    Collapse the chosen columns of every row into a single 'text' column.

    Args:
        dataset: Object exposing ``map(fn, remove_columns=...)`` and
            ``column_names`` (a Hugging Face ``Dataset`` in this notebook).
        text_fields: Column names to concatenate, in output order.
        source_name: Value written to the 'source' column of every row.

    Returns:
        Whatever ``dataset.map`` returns; each mapped row is a dict with
        exactly the keys 'text' and 'source', and all original columns are
        requested to be removed.
    """
    def _to_record(example):
        parts = []
        for field in text_fields:
            # Skip columns that are absent from the row or hold a falsy value
            # (None, empty string, 0, ...).
            if field not in example or not example[field]:
                continue
            parts.append(str(example[field]))

        # Fields are separated by a blank line; outer whitespace is trimmed.
        joined = '\n\n'.join(parts)
        return {'text': joined.strip(), 'source': source_name}

    return dataset.map(_to_record, remove_columns=dataset.column_names)

# Accumulator for the normalized datasets; later cells append to it and
# concatenate its contents. Re-running an appending cell adds a second copy,
# so re-run this cell first to reset the list.
datasets_to_merge = []

In [2]:
# Source: milkshake721/2.1M-wiki-STEM (train split).
# The title, section and text columns are merged into one 'text' column.
wiki_stem = normalize_to_text_with_meta(
    load_dataset("milkshake721/2.1M-wiki-STEM", split="train"),
    ["title", "section", "text"],
    "https://huggingface.co/datasets/milkshake721/2.1M-wiki-STEM",
)
datasets_to_merge.append(wiki_stem)

Map:   0%|          | 0/2101279 [00:00<?, ? examples/s]

In [3]:
# Source: izumi-lab/open-text-books (train split).
# Only the existing text column is kept, tagged with its source URL.
open_text_books = normalize_to_text_with_meta(
    load_dataset("izumi-lab/open-text-books", split="train"),
    ["text"],
    "https://huggingface.co/datasets/izumi-lab/open-text-books",
)
datasets_to_merge.append(open_text_books)

Map:   0%|          | 0/149700 [00:00<?, ? examples/s]

In [4]:
def normalize_se_to_text_with_meta(dataset, source_name):
    """
    Render each question/answer row as a single "Q: ... A: ..." text document.

    Args:
        dataset: Object exposing ``map(fn, remove_columns=...)`` and
            ``column_names``; every row must provide string values under
            'title_body' and 'upvoted_answer'.
        source_name: Value written to the 'source' column of every row.

    Returns:
        Whatever ``dataset.map`` returns; each mapped row is a dict with
        exactly the keys 'text' and 'source'.
    """
    def _to_record(example):
        # Layout: "Q:", blank line, question, blank line, "A:", blank line, answer.
        pieces = ["Q:\n\n", example["title_body"], "\n\nA:\n\n", example["upvoted_answer"]]
        return {'text': "".join(pieces), 'source': source_name}

    return dataset.map(_to_record, remove_columns=dataset.column_names)
    
# Source: flax-sentence-embeddings/stackexchange_math_jsonl
# ("titlebody_answer" configuration, train split), rendered as Q/A documents.
stackexchange_math = normalize_se_to_text_with_meta(
    load_dataset(
        "flax-sentence-embeddings/stackexchange_math_jsonl",
        name="titlebody_answer",
        split="train",
    ),
    "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_math_jsonl",
)
datasets_to_merge.append(stackexchange_math)

Map:   0%|          | 0/1100953 [00:00<?, ? examples/s]

In [5]:
import sys
from datasets import Dataset

def estimate_dataset_size_mb(dataset: "Dataset", sample_size: int = 1000):
    """
    Estimate the size of a dataset's 'text' column in MB by extrapolating
    the average text length of a sample to the full row count.

    The sample is the first ``min(sample_size, len(dataset))`` rows (it is
    not randomized), so the estimate is skewed if the leading rows are not
    representative of the rest.

    Args:
        dataset: Object supporting ``len()``, ``select(indices)`` and
            ``["text"]`` column access that yields strings.
        sample_size: Maximum number of leading rows to measure. Must be >= 1.

    Returns:
        Estimated size as a float, in units of 1024 * 1024 bytes.
        Returns 0.0 for an empty dataset.

    Raises:
        ValueError: If ``sample_size`` is smaller than 1.
    """
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1")

    num_rows = len(dataset)
    if num_rows == 0:
        # Nothing to sample; dividing by a zero sample size would raise
        # ZeroDivisionError, so report an empty dataset as zero bytes.
        return 0.0

    sample_size = min(sample_size, num_rows)
    sample_texts = dataset.select(range(sample_size))["text"]

    total_chars = sum(len(text) for text in sample_texts)
    avg_chars_per_doc = total_chars / sample_size

    total_chars_all = avg_chars_per_doc * num_rows
    # Assuming 1 char = 1 byte (ASCII-heavy text); text with many multi-byte
    # characters will be underestimated.
    total_bytes = total_chars_all

    size_mb = total_bytes / (1024 * 1024)
    return size_mb

# Print one size-estimate line per dataset collected in datasets_to_merge.
# Each normalized dataset has rows of the form {'text': ..., 'source': ...}.
for ds in datasets_to_merge:
    # Row 0 is representative: every row of a dataset carries the same 'source'.
    source_name = ds[0]['source']
    size_mb = estimate_dataset_size_mb(ds)
    # ljust(25) only pads names shorter than 25 characters.
    print("Dataset: {} Estimated Size: {:.2f} MB".format(source_name.ljust(25), size_mb))


Dataset: https://huggingface.co/datasets/milkshake721/2.1M-wiki-STEM Estimated Size: 1416.46 MB
Dataset: https://huggingface.co/datasets/izumi-lab/open-text-books Estimated Size: 402.20 MB
Dataset: https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_math_jsonl Estimated Size: 1618.09 MB


In [6]:
# Combine every normalized dataset collected so far into a single corpus.
print("Merging datasets...")
merged_dataset = concatenate_datasets(datasets_to_merge)

# Persist the corpus under one directory in two formats:
# Arrow (save_to_disk) and JSON Lines (to_json with lines=True).
output_dir = "merged_stem_corpus"
merged_dataset.save_to_disk(output_dir)
merged_dataset.to_json(output_dir + "/merged_stem_corpus.jsonl", lines=True)

print("Dataset successfully merged and saved to '" + output_dir + "'.")

Merging datasets...


Saving the dataset (0/8 shards):   0%|          | 0/3351932 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/3352 [00:00<?, ?ba/s]

Dataset successfully merged and saved to 'merged_stem_corpus'.


In [7]:
# Publish the merged corpus to the Hugging Face Hub.
dataset_repo_id = "igzi/pile-stem-corpus"
print("Pushing dataset to Hugging Face Hub: " + dataset_repo_id)

# private=False requests a public repository; pass private=True to keep it private.
merged_dataset.push_to_hub(dataset_repo_id, private=False)

print("Dataset successfully pushed to https://huggingface.co/datasets/" + dataset_repo_id)

Pushing dataset to Hugging Face Hub: igzi/pile-stem-corpus


Uploading the dataset shards:   0%|          | 0/8 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/419 [00:00<?, ?ba/s]

Dataset successfully pushed to https://huggingface.co/datasets/igzi/pile-stem-corpus


In [8]:
# Source: gfissore/arxiv-abstracts-2021 (train split).
# Title and abstract are merged into one 'text' column.
arxiv_abstracts = normalize_to_text_with_meta(
    load_dataset("gfissore/arxiv-abstracts-2021", split="train"),
    ["title", "abstract"],
    "https://huggingface.co/datasets/gfissore/arxiv-abstracts-2021",
)
datasets_to_merge.append(arxiv_abstracts)

Map:   0%|          | 0/1999486 [00:00<?, ? examples/s]

In [9]:
# Source: timaeus/pile-dm_mathematics (train split).
# NOTE(review): unlike the other sources, this 'source' URL ends in "/viewer";
# confirm whether that is intended before changing it, since the value is
# written into every row.
pile_dmmath = normalize_to_text_with_meta(
    load_dataset("timaeus/pile-dm_mathematics", split="train"),
    ["text"],
    "https://huggingface.co/datasets/timaeus/pile-dm_mathematics/viewer",
)
datasets_to_merge.append(pile_dmmath)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [10]:
# Print one size-estimate line per dataset now in datasets_to_merge.
# Each normalized dataset has rows of the form {'text': ..., 'source': ...}.
for ds in datasets_to_merge:
    # Row 0 is representative: every row of a dataset carries the same 'source'.
    source_name = ds[0]['source']
    size_mb = estimate_dataset_size_mb(ds)
    # ljust(25) only pads names shorter than 25 characters.
    print("Dataset: {} Estimated Size: {:.2f} MB".format(source_name.ljust(25), size_mb))

Dataset: https://huggingface.co/datasets/milkshake721/2.1M-wiki-STEM Estimated Size: 1416.46 MB
Dataset: https://huggingface.co/datasets/izumi-lab/open-text-books Estimated Size: 402.20 MB
Dataset: https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_math_jsonl Estimated Size: 1618.09 MB
Dataset: https://huggingface.co/datasets/gfissore/arxiv-abstracts-2021 Estimated Size: 1660.30 MB
Dataset: https://huggingface.co/datasets/timaeus/pile-dm_mathematics/viewer Estimated Size: 781.40 MB


In [11]:
# Combine every dataset currently in datasets_to_merge into the extended corpus.
print("Merging datasets...")
merged_dataset = concatenate_datasets(datasets_to_merge)

# Persist the extended corpus under one directory in two formats:
# Arrow (save_to_disk) and JSON Lines (to_json with lines=True).
output_dir = "merged_stem_corpus_extended"
merged_dataset.save_to_disk(output_dir)
merged_dataset.to_json(output_dir + "/merged_stem_corpus_extended.jsonl", lines=True)

print("Dataset successfully merged and saved to '" + output_dir + "'.")

Merging datasets...


Saving the dataset (0/14 shards):   0%|          | 0/5451418 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/5452 [00:00<?, ?ba/s]

Dataset successfully merged and saved to 'merged_stem_corpus_extended'.


In [12]:
# Publish the extended corpus to the Hugging Face Hub.
dataset_repo_id = "igzi/pile-stem-corpus-extended"
print("Pushing dataset to Hugging Face Hub: " + dataset_repo_id)

# private=False requests a public repository; pass private=True to keep it private.
merged_dataset.push_to_hub(dataset_repo_id, private=False)

print("Dataset successfully pushed to https://huggingface.co/datasets/" + dataset_repo_id)

Pushing dataset to Hugging Face Hub: igzi/pile-stem-corpus-extended


Uploading the dataset shards:   0%|          | 0/14 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/390 [00:00<?, ?ba/s]

Dataset successfully pushed to https://huggingface.co/datasets/igzi/pile-stem-corpus-extended
