## Create Chunked Dataset

In [None]:
import os
import getpass

from datasets import load_dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from tqdm.auto import tqdm


In [9]:
# Repository id passed to `load_dataset` for both splits below.
DATASET_REPO = "MikiV/gutenberg_children_books_cleaned"

# Number of tokens in every output chunk (8 * 1024 == 8192).
CHUNK_LENGTH = 8 * 1024

In [10]:

# Load the source repo's 'train' and 'test' splits, in that order; the 'test'
# split is bound to `val_dataset`.
train_dataset, val_dataset = (
    load_dataset(DATASET_REPO, split=split_name)
    for split_name in ('train', 'test')
)



README.md:   0%|          | 0.00/549 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/58.1M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [11]:


def chunk_dataset_with_map(dataset, token_column='input_ids', chunk_length=CHUNK_LENGTH):
    """
    Split every example's token sequence into fixed-length chunks.

    Each example is expanded into ``len(tokens) // chunk_length`` rows that
    hold exactly ``chunk_length`` tokens each. Trailing tokens that do not
    fill a whole chunk are dropped, and examples shorter than
    ``chunk_length`` produce no rows at all. Every other column is copied
    unchanged onto each chunk of its source example.

    The work is done through ``dataset.map(..., batched=True)`` so examples
    are processed one batch at a time.

    Args:
        dataset: Object exposing ``map`` and ``column_names`` (presumably a
            ``datasets.Dataset`` -- confirm against the caller).
        token_column: Name of the column holding the token sequences.
        chunk_length: Number of tokens per output chunk; must be positive.

    Returns:
        The result of ``dataset.map``: a dataset with one row per chunk.

    Raises:
        ValueError: If ``chunk_length`` is not positive.
    """
    # Fail fast: 0 would raise ZeroDivisionError inside map(), and a negative
    # value would silently produce an empty dataset.
    if chunk_length <= 0:
        raise ValueError(f"chunk_length must be a positive integer, got {chunk_length!r}")

    def chunk_batch(batch):
        """Expand one batch (dict of column -> list) into per-chunk rows."""
        columns = list(batch.keys())
        passthrough_columns = [col for col in columns if col != token_column]
        chunked_batch = {col: [] for col in columns}

        for row_idx, tokens in enumerate(batch[token_column]):
            # Floor division discards the incomplete tail of the sequence.
            num_chunks = len(tokens) // chunk_length
            if num_chunks == 0:
                continue

            for start_idx in range(0, num_chunks * chunk_length, chunk_length):
                chunked_batch[token_column].append(
                    tokens[start_idx:start_idx + chunk_length]
                )

            # Repeat the non-token values once per chunk so all columns stay
            # the same length.
            for col in passthrough_columns:
                chunked_batch[col].extend([batch[col][row_idx]] * num_chunks)

        return chunked_batch

    # Use map with batched=True for memory efficiency; batched mode also lets
    # the function return a different number of rows than it received.
    chunked_dataset = dataset.map(
        chunk_batch,
        batched=True,
        remove_columns=dataset.column_names,  # Remove original columns
        batch_size=1000,  # Adjust based on your memory constraints
    )

    return chunked_dataset
# Chunk both splits with the same chunk length (train first, then validation).
chunked_dataset_train, chunked_dataset_val = [
    chunk_dataset_with_map(split, chunk_length=CHUNK_LENGTH)
    for split in (train_dataset, val_dataset)
]

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [15]:
# Report how many fixed-length chunks each split produced.
print("Train dataset length:", len(chunked_dataset_train))
print("Validation dataset length:", len(chunked_dataset_val))

# Target repo name: source repo id with '/' flattened to '-', suffixed with
# the chunk size.
DS_REPO_NAME = f"{DATASET_REPO.replace('/', '-')}-chunked-{CHUNK_LENGTH}"

# Bundle the two chunked splits under the names they are published with.
dict_dataset = DatasetDict({
    'train': chunked_dataset_train,
    'validation': chunked_dataset_val,
})

# NOTE(review): DS_REPO_NAME already begins with the source namespace, so the
# pushed repo id comes out as "MikiV/MikiV-..." -- confirm this is intended.
dict_dataset.push_to_hub(f"MikiV/{DS_REPO_NAME}", private=False)




Train dataset length: 1593
Validation dataset length: 26


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/MikiV/MikiV-gutenberg_children_books_cleaned-chunked-8192/commit/6e74585a77560e399ca557b0ad5a075547e8bc6e', commit_message='Upload dataset', commit_description='', oid='6e74585a77560e399ca557b0ad5a075547e8bc6e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MikiV/MikiV-gutenberg_children_books_cleaned-chunked-8192', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MikiV/MikiV-gutenberg_children_books_cleaned-chunked-8192'), pr_revision=None, pr_num=None)