In [1]:
import os
import getpass

from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer
from dotenv import load_dotenv
import matplotlib.pyplot as plt

from tqdm import tqdm

## Create Subchunked Dataset

In [2]:

# Load variables from the local .envrc file so HF_TOKEN can be supplied there.
load_dotenv('.envrc')

# Authenticate with the Hugging Face Hub. Use HF_TOKEN when it is present in the
# environment; otherwise prompt for it so the token is never stored in the notebook.
login(
    token=os.environ['HF_TOKEN']
    if 'HF_TOKEN' in os.environ
    else getpass.getpass('Huggingface token: ')
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Length of each row's input_ids in the source dataset; it is also the numeric
# suffix of the source repo id, which is derived from it below.
CHUNK_SIZE = 128
# Length of each row's input_ids in the dataset produced by this notebook.
SUBCHUNK_SIZE = 64
assert SUBCHUNK_SIZE > 0 and CHUNK_SIZE % SUBCHUNK_SIZE == 0, (
    f'SUBCHUNK_SIZE ({SUBCHUNK_SIZE}) must be a positive divisor of CHUNK_SIZE ({CHUNK_SIZE})'
)
# Number of subchunks cut from every source chunk.
SUBCHUNK_RATIO = CHUNK_SIZE // SUBCHUNK_SIZE

# Both repo ids are built from the same stem and the size constants, so editing
# CHUNK_SIZE or SUBCHUNK_SIZE can never leave a name out of sync with the data.
DATASET_STEM = 'MikiV/SimpleStories-SimpleStories'
LARGE_CHUNKED_DATASET = f'{DATASET_STEM}-chunked-{CHUNK_SIZE}'
SUBCHUNK_DATASET_NAME = f'{DATASET_STEM}-subchunked-{SUBCHUNK_SIZE}x{SUBCHUNK_RATIO}'

In [4]:
# Load the already chunked source dataset; the bare name on the last line
# displays its splits and features as the cell output.
chunked_dataset = load_dataset(path=LARGE_CHUNKED_DATASET)
chunked_dataset


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3653829
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 152243
    })
})

In [None]:
def split_into_subchunks(example):
    """Split one chunked example into SUBCHUNK_RATIO consecutive subchunks.

    Args:
        example: Mapping with an 'input_ids' sequence. Only the first
            SUBCHUNK_RATIO * SUBCHUNK_SIZE entries are read.

    Returns:
        ``{'subchunks': [{'input_ids': ...}, ...]}`` holding SUBCHUNK_RATIO
        slices of SUBCHUNK_SIZE entries each, in their original order.
        Slicing never raises, so an input shorter than expected yields
        short or empty trailing subchunks.
    """
    token_ids = example['input_ids']
    subchunks = [
        {'input_ids': token_ids[start:start + SUBCHUNK_SIZE]}
        for start in range(0, SUBCHUNK_RATIO * SUBCHUNK_SIZE, SUBCHUNK_SIZE)
    ]
    return {'subchunks': subchunks}


def split_batch_into_subchunks(batch):
    """Batched counterpart of split_into_subchunks for ``Dataset.map(batched=True)``.

    Args:
        batch: Mapping whose 'input_ids' entry is a list of per-row sequences.

    Returns:
        ``{'input_ids': [...]}`` with one entry per subchunk, i.e.
        SUBCHUNK_RATIO output rows for every input row, in input order.
    """
    return {
        'input_ids': [
            subchunk['input_ids']
            for token_ids in batch['input_ids']
            for subchunk in split_into_subchunks({'input_ids': token_ids})['subchunks']
        ]
    }


source_train = chunked_dataset['train']

# A batched map may return more rows than it receives, so the flattened split is
# produced directly instead of collecting every subchunk in a Python list first.
# All source columns are removed because their row count no longer matches; the
# 'input_ids' column returned by the function is kept.
subchunked_train = source_train.map(
    split_batch_into_subchunks,
    batched=True,
    remove_columns=source_train.column_names,
)
assert len(subchunked_train) == len(source_train) * SUBCHUNK_RATIO

# NOTE(review): the test split is passed through unchanged, so it keeps its
# un-split rows and its extra columns while train only has 'input_ids'.
# Confirm this asymmetry is intended.
subchunked_dataset = DatasetDict({
    'train': subchunked_train,
    'test': chunked_dataset['test'],
})

Map:   0%|          | 0/3653829 [00:00<?, ? examples/s]

In [None]:
# Report the row counts of the subchunked and source train splits, one per line.
print(
    f"Train set size: {len(subchunked_dataset['train'])} samples",
    f"Original train set size: {len(chunked_dataset['train'])} samples",
    sep="\n",
)