In [1]:
import os
import getpass

from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer
from dotenv import load_dotenv
import matplotlib.pyplot as plt

from tqdm import tqdm

In [2]:

# Load variables from the local .envrc file into the process environment.
load_dotenv('.envrc')

# Authenticate once: use HF_TOKEN when it is present in the environment,
# otherwise ask for the token interactively without echoing it.
login(
    token=(
        os.environ['HF_TOKEN']
        if 'HF_TOKEN' in os.environ
        else getpass.getpass('Huggingface token: ')
    )
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Length passed to split_into_subchunks as `chunk_size`; it is also the number
# that gets substituted in the input dataset name below.
CHUNK_SIZE = 128
# Sub-chunk widths to generate; one output dataset name is derived per entry.
TARGET_SIZES = [
    # 16, 32,
    64]
for target_size in TARGET_SIZES:
    # A negative size would pass the divisibility check but yield no sub-chunks,
    # so positivity is checked explicitly.
    assert target_size > 0, f'target size must be positive, got {target_size}'
    assert CHUNK_SIZE % target_size == 0, (
        f'target size {target_size} does not evenly divide CHUNK_SIZE={CHUNK_SIZE}'
    )

INPUT_DATASET = 'MikiV/SimpleStories-SimpleStories-chunked-128'
# 'chunked' -> 'subchunked' and '<CHUNK_SIZE>' -> '<size>x<count>',
# e.g. '...-chunked-128' -> '...-subchunked-64x2'.
OUTPUT_DATASET_NAMES = [
    INPUT_DATASET
    .replace('chunked', 'subchunked')
    .replace(str(CHUNK_SIZE), f'{target_size}x{CHUNK_SIZE // target_size}')
    for target_size in TARGET_SIZES
]
# If neither substitution matched, the derived name would equal the input name
# and the later push would target the source repository itself.
assert INPUT_DATASET not in OUTPUT_DATASET_NAMES, (
    f'output dataset name is identical to INPUT_DATASET={INPUT_DATASET!r}; '
    f"expected the name to contain 'chunked' and/or '{CHUNK_SIZE}'"
)

In [4]:
def split_into_subchunks(example, chunk_size, subchunk_size):
    """Cut one example's ``input_ids`` into consecutive fixed-width slices.

    Args:
        example: Mapping that holds a sliceable sequence under ``'input_ids'``.
        chunk_size: Nominal length of that sequence. Exactly
            ``chunk_size // subchunk_size`` slices are produced, regardless of
            the sequence's actual length.
        subchunk_size: Width of each slice.

    Returns:
        ``{'subchunks': [{'input_ids': <slice>}, ...]}`` with slices in order.
        Slices that run past the end of the sequence come back short or empty,
        as ordinary Python slicing does.
    """
    token_ids = example['input_ids']
    n_subchunks = chunk_size // subchunk_size

    pieces = []
    for index in range(n_subchunks):
        start = index * subchunk_size
        pieces.append({'input_ids': token_ids[start:start + subchunk_size]})
    return {'subchunks': pieces}

In [5]:
# Load every split of the source dataset.
input_dataset = load_dataset(INPUT_DATASET)
print(f"Loaded dataset: {input_dataset}")

# One pass per target size: re-slice each split row by row, then publish the
# result under the matching output name.
for target_size, output_dataset_name in zip(TARGET_SIZES, OUTPUT_DATASET_NAMES):
    subchunked_splits = {}
    for split_name, split_data in input_dataset.items():
        subchunked_splits[split_name] = split_data.map(
            split_into_subchunks,
            # Extra arguments forwarded to split_into_subchunks for every row.
            fn_kwargs={'chunk_size': CHUNK_SIZE, 'subchunk_size': target_size},
            batched=False,
            # Drop the original columns so only 'subchunks' remains.
            remove_columns=split_data.column_names,
            num_proc=4,
        )
    output_dataset = DatasetDict(subchunked_splits)
    print(f"{output_dataset_name}: {output_dataset}")
    output_dataset.push_to_hub(output_dataset_name, private=True)

Loaded dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3653646
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 152426
    })
})
MikiV/SimpleStories-SimpleStories-subchunked-64x2: DatasetDict({
    train: Dataset({
        features: ['subchunks'],
        num_rows: 3653646
    })
    validation: Dataset({
        features: ['subchunks'],
        num_rows: 152426
    })
})


Uploading the dataset shards:   0%|          | 0/8 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/390 [00:00<?, ?B/s]