## Split Dataset into Chunks

### Read Tokenizer

In [7]:
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Config

# Load the GPT2 configuration
config = GPT2Config()

# Load the tokenizer from the local directory)
tokenizer = ByteLevelBPETokenizer(
    "./tokenizer/vocab.json",
    "./tokenizer/merges.txt",
    add_prefix_space=True
)

### Load Carolina Wik Dataset

In [2]:
from datasets import load_dataset

dataset = load_dataset("carolina-c4ai/corpus-carolina", taxonomy="wik")['corpus']

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


### Shuffle the Dataset

In [3]:
dataset = dataset.shuffle(seed=42)

### Create tokenized dataset

In [4]:
import torch
from tqdm import tqdm

def generate_chunk(dataset, target_size):
    current_chunk = torch.empty((0,))
    for entry in dataset['text']:
        # Tokenize entry
        entry = tokenizer.encode(entry, add_special_tokens=False).ids
        entry = torch.asarray(entry, dtype=torch.long)

        # Update chunk
        current_chunk = torch.cat([current_chunk, entry])

        # Yield chunks
        while current_chunk.shape[0] > target_size:
            yield current_chunk[:target_size]
            current_chunk = current_chunk[target_size:]


In [5]:
import zarr

CHUNK_SIZE = config.n_positions - 2

zarr_array = zarr.open(
    "dataset.zarr",
    mode='w',
    shape=(0, CHUNK_SIZE),
    dtype='long',
    chunks=(1, CHUNK_SIZE)
)

for chunk in tqdm(generate_chunk(dataset, CHUNK_SIZE)):
    current_shape = zarr_array.shape
    new_shape = list(current_shape)
    new_shape[0] += 1
    zarr_array.resize(tuple(new_shape))
    zarr_array[-1, :] = chunk

550312it [1:42:30, 89.47it/s]


In [8]:
import zarr
old_zarr = zarr.open("dataset.zarr", "r")
CHUNK_SIZE = config.n_positions - 2

In [11]:
new_zarr = zarr.open("dataset_copy.zarr", "w", shape=old_zarr.shape, dtype="int", chunk=(2048, CHUNK_SIZE))

In [12]:
new_zarr[:] = old_zarr[:]

### 