In [6]:
import os
import getpass

from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer
from dotenv import load_dotenv
import matplotlib.pyplot as plt

from tqdm import tqdm

## Create Subchunked Dataset

In [7]:

# Read `.envrc` with python-dotenv so HF_TOKEN can be supplied from that file.
load_dotenv('.envrc')

# Authenticate with the Hub: use HF_TOKEN when the variable exists in the
# environment, otherwise prompt interactively so no token is hardcoded here.
login(
    token=(
        os.environ['HF_TOKEN']
        if 'HF_TOKEN' in os.environ
        else getpass.getpass('Huggingface token: ')
    )
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [8]:
# Length of each row in the source dataset and of each piece it is cut into.
CHUNK_SIZE = 128
SUBCHUNK_SIZE = 64

# Every chunk must split into a whole number of subchunks.
assert not CHUNK_SIZE % SUBCHUNK_SIZE
SUBCHUNK_RATIO = CHUNK_SIZE // SUBCHUNK_SIZE

# Source repo on the Hub, and the target repo name derived from it
# ("chunked" -> "subchunked", "<CHUNK_SIZE>" -> "<SUBCHUNK_SIZE>x<SUBCHUNK_RATIO>").
# NOTE(review): both replacements are unanchored, so any other occurrence of
# "chunked" or of the CHUNK_SIZE digits in the repo name would be rewritten too.
LARGE_CHUNKED_DATASET = 'MikiV/SimpleStories-SimpleStories-chunked-128'
SUBCHUNK_DATASET_NAME = (
    LARGE_CHUNKED_DATASET
    .replace('chunked', 'subchunked')
    .replace(str(CHUNK_SIZE), f'{SUBCHUNK_SIZE}x{SUBCHUNK_RATIO}')
)

In [9]:
# Load all splits of the already-chunked source dataset named in the config cell.
chunked_dataset = load_dataset(LARGE_CHUNKED_DATASET)
# Bare expression so the notebook displays the DatasetDict summary (splits, features, row counts).
chunked_dataset


README.md:   0%|          | 0.00/390 [00:00<?, ?B/s]

data/train-00000-of-00004.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

data/train-00001-of-00004.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

data/train-00002-of-00004.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

data/train-00003-of-00004.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/29.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3653646 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/152426 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3653646
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 152426
    })
})

In [11]:
def split_into_subchunks(example):
    """Cut one example's ``input_ids`` into ``SUBCHUNK_RATIO`` consecutive slices.

    Args:
        example: A single (non-batched) example; only ``example['input_ids']``
            is read.

    Returns:
        ``{'subchunks': [...]}`` where each entry is ``{'input_ids': <slice>}``
        covering ``SUBCHUNK_SIZE`` positions. Slices that run past the end of
        the sequence come back shorter (or empty), as plain slicing does.
    """
    token_ids = example['input_ids']
    pieces = []
    for index in range(SUBCHUNK_RATIO):
        start = index * SUBCHUNK_SIZE
        stop = (index + 1) * SUBCHUNK_SIZE
        pieces.append({'input_ids': token_ids[start:stop]})
    return {'subchunks': pieces}

def _explode_batch_into_subchunks(batch):
    """Batched ``map`` function that turns every chunk in ``batch`` into several rows.

    Args:
        batch: The dict of lists that ``Dataset.map(..., batched=True)`` passes
            in; only ``batch['input_ids']`` is read.

    Returns:
        ``{'input_ids': [...]}`` with one entry per subchunk, in the same order
        as the input rows (all subchunks of row 0, then row 1, ...). The
        slicing itself is delegated to ``split_into_subchunks``.
    """
    return {
        'input_ids': [
            subchunk['input_ids']
            for input_ids in batch['input_ids']
            for subchunk in split_into_subchunks({'input_ids': input_ids})['subchunks']
        ]
    }


# A batched map is allowed to return more rows than it received, so the
# subchunks are written straight into the resulting dataset. This avoids
# materialising millions of row dicts in a Python list and rebuilding the
# dataset from that list afterwards.
subchunked_train = chunked_dataset['train'].map(
    _explode_batch_into_subchunks,
    batched=True,
    remove_columns=chunked_dataset['train'].column_names,
    num_proc=4,
)

# Each source row must have produced exactly SUBCHUNK_RATIO rows.
assert len(subchunked_train) == SUBCHUNK_RATIO * len(chunked_dataset['train'])

# NOTE(review): the test split is passed through unchanged, so its rows keep the
# original chunk length while train rows are subchunks -- confirm this is intended.
subchunked_dataset = DatasetDict({
    'train': subchunked_train,
    'test': chunked_dataset['test'],
})

Map (num_proc=4):   0%|          | 0/3653646 [00:00<?, ? examples/s]

In [12]:
# Report the row count after subchunking next to the source row count.
print(
    f"Train set size: {len(subchunked_dataset['train'])} samples",
    f"Original train set size: {len(chunked_dataset['train'])} samples",
    sep='\n',
)

Train set size: 7307292 samples
Original train set size: 3653646 samples


In [13]:
# Upload all splits to the Hub under the derived repo name; private=True requests a private repo.
subchunked_dataset.push_to_hub(SUBCHUNK_DATASET_NAME, private=True)

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/MikiV/SimpleStories-SimpleStories-subchunked-64x2/commit/8029d6ea804b47a0c7a42e5286401c9b3463f9df', commit_message='Upload dataset', commit_description='', oid='8029d6ea804b47a0c7a42e5286401c9b3463f9df', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MikiV/SimpleStories-SimpleStories-subchunked-64x2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MikiV/SimpleStories-SimpleStories-subchunked-64x2'), pr_revision=None, pr_num=None)