# MiniPile from (Deduplicated) Pile

[EleutherAI/the_pile_deduplicated](https://huggingface.co/datasets/EleutherAI/the_pile_deduplicated)

In [None]:
# !pip install datasets
from datasets import load_dataset

# Open the deduplicated Pile as a stream so nothing has to be downloaded up front.
dataset = load_dataset("EleutherAI/the_pile_deduplicated", split="train", streaming=True)

# Example: Iterating over the dataset
# The stream covers the whole training split, so an unbounded `for` loop would
# print every document and the cell would effectively never finish. Only
# preview a handful of examples here.
PREVIEW_EXAMPLES = 3
for example in dataset.take(PREVIEW_EXAMPLES):
    print(example['text'])  # Process each example

# Example: Grab first example
first_example = next(iter(dataset))
print(first_example['text'])

# Example: Shuffle the dataset with a buffer of 10000 and a seed of 42
shuffled_dataset = dataset.shuffle(buffer_size=10_000, seed=42)

# Example: Take the first 1000 examples
subset = dataset.take(1000)

# Example: Skip the first 1000 examples
skipped_dataset = dataset.skip(1000)

# Example: Tokenize the dataset
def tokenize_function(example):
    """Apply the tokenizer to the "text" field of the incoming batch.

    The function is mapped with ``batched=True`` below, so ``example`` is
    whatever batch structure ``dataset.map`` hands over.

    NOTE(review): ``tokenizer`` is not defined anywhere in this notebook; a
    tokenizer callable must be created before the mapped dataset is consumed.
    """
    texts = example["text"]
    return tokenizer(texts)  # Some Tokenizer


tokenized_dataset = dataset.map(tokenize_function, batched=True)

### Usage by Tech Companies
Several major tech and AI companies have reportedly used The Pile for training their models:

- Anthropic
- Salesforce
- Apple (for training OpenELM)
- Nvidia
- Bloomberg
- Databricks

### Ethical and Legal Considerations

- Copyright Concerns: The inclusion of YouTube content and other web-scraped data has led to discussions about copyright and fair use in AI training datasets.

## Download the Full Dataset to a Specific Folder

In [None]:
import os
from pathlib import Path
from datasets import load_dataset
from huggingface_hub import snapshot_download

# Set custom directories
# The download root can be overridden through the PILE_DOWNLOAD_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset it falls back to the original local disk path.
down_dir = Path(os.environ.get("PILE_DOWNLOAD_DIR", "/media/marcus/Festplatte/"))
target_path = down_dir / "Pile_Deduplicated"  # destination of the downloaded files
cache_path = down_dir / "Pile_Deduplicated_Cache"  # cache directory passed to the HF libraries

In [11]:
import time

# Create directories
os.makedirs(target_path, exist_ok=True)
os.makedirs(cache_path, exist_ok=True)

# Override all relevant environment variables
# NOTE(review): `datasets` and `huggingface_hub` were already imported in an
# earlier cell and appear to read these variables at import time, so setting
# them here may have no effect on the running kernel. The explicit `cache_dir`
# arguments passed below are what reliably redirect the cache.
os.environ['HF_HOME'] = str(cache_path)
os.environ['HF_DATASETS_CACHE'] = str(cache_path)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(cache_path)
os.environ['TRANSFORMERS_CACHE'] = str(cache_path)

print("Downloading dataset...")

# First, download the dataset files
repo_id = "EleutherAI/the_pile_deduplicated"

# Transient CDN errors are expected when fetching this many files, so retry;
# but bound the attempts, back off between them, and report each failure so a
# permanent problem (disk full, bad credentials, wrong repo id) surfaces
# instead of spinning forever in silence.
MAX_DOWNLOAD_ATTEMPTS = 50
MAX_BACKOFF_SECONDS = 60

for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
    try:
        snapshot_download(repo_id, repo_type="dataset", cache_dir=str(cache_path), local_dir=str(target_path))
        break
    except Exception as err:
        if attempt == MAX_DOWNLOAD_ATTEMPTS:
            # Out of retries: let the last error propagate to the user.
            raise
        wait_seconds = min(2 ** attempt, MAX_BACKOFF_SECONDS)
        print(
            f"Download attempt {attempt}/{MAX_DOWNLOAD_ATTEMPTS} failed ({err!r}); "
            f"retrying in {wait_seconds}s..."
        )
        time.sleep(wait_seconds)

Downloading dataset...


Fetching 1652 files:  41%|████      | 681/1652 [00:16<01:00, 16.01it/s] Error while downloading from https://cdn-lfs.hf.co/repos/6c/d2/6cd2fa47d3ac88ee082caf5d269c09d9cdb5188332344300455675c2188627f2/c3e1554544ecb8849e18afdbb41c0d1920472dcac4ad34982f8a7ebe7482750f?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27train-00696-of-01650-69d6ee48392821e3.parquet%3B+filename%3D%22train-00696-of-01650-69d6ee48392821e3.parquet%22%3B&Expires=1727559012&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNzU1OTAxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy82Yy9kMi82Y2QyZmE0N2QzYWM4OGVlMDgyY2FmNWQyNjljMDlkOWNkYjUxODgzMzIzNDQzMDA0NTU2NzVjMjE4ODYyN2YyL2MzZTE1NTQ1NDRlY2I4ODQ5ZTE4YWZkYmI0MWMwZDE5MjA0NzJkY2FjNGFkMzQ5ODJmOGE3ZWJlNzQ4Mjc1MGY%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=RqPsGLLHsGTxJRSGa2m5UKJU3fkxaFy199OgMvoF%7ESyi1ocBmTatfnrJ6be9ShS4js19l%7EF-kMUxOJTY1b0UEUsP9G8xipiUlivNRn-D6LjuMUT80f8qy4QCOFeridFd8bUkh4Z

In [None]:
# Then, build the dataset from the parquet files found under the target folder
# (recursive glob), using the custom cache directory.
dataset = load_dataset(
    path="parquet",
    cache_dir=str(cache_path),
    data_files=str(target_path / "**" / "*.parquet"),
)

# Report where the files live and what was loaded.
print(
    f"Dataset downloaded to: {target_path}",
    f"Dataset info: {dataset}",
    sep="\n",
)