In [2]:
from transformers import GPT2Config, GPT2Tokenizer
from datasets import load_dataset
import numpy as np
import zarr
from tqdm import tqdm
from datasets import Dataset

# Load the GPT2 configuration
config = GPT2Config()

# Load the tokenizer from the local directory
tokenizer = GPT2Tokenizer(
    vocab_file="./tokenizer/vocab.json",
    merges_file="./tokenizer/merges.txt"
)

# Configure tokenizer settings
tokenizer.model_max_length = config.n_positions
tokenizer.eos_token = tokenizer.bos_token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the dataset
dataset = load_dataset("carolina-c4ai/corpus-carolina", taxonomy="wik")['corpus']

# Define the padding and truncation function
def tokenize_function(batch):
    return tokenizer(
        batch['text'],
        padding='max_length',  # Pad to the maximum length of the model
        truncation=True,       # Truncate if the text is longer than the maximum length
        max_length=config.n_positions,
        return_tensors='np'    # Return NumPy arrays
    )

# Create a Zarr group to store the tokenized data
zarr_save_path = 'dataset.zarr'
zarr_group = zarr.open_group(zarr_save_path, mode='w')
zarr_input_ids = zarr_group.create_dataset('input_ids', shape=(0, config.n_positions), maxshape=(None, config.n_positions), chunks=(1000, config.n_positions), dtype='int')
zarr_attention_mask = zarr_group.create_dataset('attention_mask', shape=(0, config.n_positions), maxshape=(None, config.n_positions), chunks=(1000, config.n_positions), dtype='int')

# Tokenize and save the dataset incrementally
batch_size = 1000  # Adjust batch size as needed
for i in tqdm(range(0, len(dataset), batch_size)):
    batch = dataset[i:i+batch_size]
    tokenized_batch = tokenize_function(batch)

    input_ids = tokenized_batch['input_ids']
    attention_mask = tokenized_batch['attention_mask']
    
    zarr_input_ids.append(input_ids)
    zarr_attention_mask.append(attention_mask)

print(f"Tokenized dataset saved to {zarr_save_path}")


_dict = {"input_ids": zarr_group.input_ids, "attention_mask": zarr_group.attention_mask}

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
  warn("ignoring keyword argument %r" % k)
100%|██████████| 961/961 [25:23<00:00,  1.59s/it]

Tokenized dataset saved to dataset.zarr





In [4]:
import tempfile
import os

# Create a temporary directory for saving chunks
temp_dir = tempfile.mkdtemp()

# Save the tokenized data in smaller chunks
chunk_size = 10000  # Adjust chunk size as needed
chunk_paths = []

for start in tqdm(range(0, len(zarr_input_ids), chunk_size)):
    end = start + chunk_size
    input_ids_chunk = np.array(zarr_input_ids[start:end])
    attention_mask_chunk = np.array(zarr_attention_mask[start:end])
    
    dataset_dict = {"input_ids": input_ids_chunk, "attention_mask": attention_mask_chunk}
    chunk_dataset = Dataset.from_dict(dataset_dict)
    
    chunk_path = os.path.join(temp_dir, f'chunk_{start//chunk_size}')
    chunk_dataset.save_to_disk(chunk_path)
    chunk_paths.append(chunk_path)

  0%|          | 0/97 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  1%|          | 1/97 [00:00<01:09,  1.39it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  2%|▏         | 2/97 [00:01<01:02,  1.52it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  3%|▎         | 3/97 [00:01<00:59,  1.59it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  4%|▍         | 4/97 [00:02<00:56,  1.66it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  5%|▌         | 5/97 [00:03<00:55,  1.65it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  6%|▌         | 6/97 [00:03<00:53,  1.70it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  7%|▋         | 7/97 [00:04<00:52,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  8%|▊         | 8/97 [00:04<00:50,  1.75it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

  9%|▉         | 9/97 [00:05<00:50,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 10%|█         | 10/97 [00:05<00:49,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 11%|█▏        | 11/97 [00:06<00:48,  1.78it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 12%|█▏        | 12/97 [00:07<00:48,  1.77it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 13%|█▎        | 13/97 [00:07<00:47,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 14%|█▍        | 14/97 [00:08<00:48,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 15%|█▌        | 15/97 [00:08<00:47,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 16%|█▋        | 16/97 [00:09<00:46,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 18%|█▊        | 17/97 [00:09<00:47,  1.69it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 19%|█▊        | 18/97 [00:10<00:46,  1.70it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 20%|█▉        | 19/97 [00:11<00:45,  1.71it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 21%|██        | 20/97 [00:11<00:45,  1.71it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 22%|██▏       | 21/97 [00:12<00:43,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 23%|██▎       | 22/97 [00:12<00:42,  1.78it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 24%|██▎       | 23/97 [00:13<00:41,  1.79it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 25%|██▍       | 24/97 [00:13<00:41,  1.75it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 26%|██▌       | 25/97 [00:14<00:41,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 27%|██▋       | 26/97 [00:15<00:41,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 28%|██▊       | 27/97 [00:15<00:40,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 29%|██▉       | 28/97 [00:16<00:40,  1.69it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 30%|██▉       | 29/97 [00:16<00:40,  1.67it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 31%|███       | 30/97 [00:17<00:43,  1.53it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 32%|███▏      | 31/97 [00:18<00:42,  1.56it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 33%|███▎      | 32/97 [00:18<00:41,  1.58it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 34%|███▍      | 33/97 [00:19<00:39,  1.62it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 35%|███▌      | 34/97 [00:20<00:37,  1.67it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 36%|███▌      | 35/97 [00:20<00:37,  1.66it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 37%|███▋      | 36/97 [00:21<00:35,  1.71it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 38%|███▊      | 37/97 [00:21<00:36,  1.66it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 39%|███▉      | 38/97 [00:22<00:35,  1.67it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 40%|████      | 39/97 [00:23<00:34,  1.70it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 41%|████      | 40/97 [00:23<00:35,  1.62it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 42%|████▏     | 41/97 [00:24<00:34,  1.65it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 43%|████▎     | 42/97 [00:24<00:32,  1.68it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 44%|████▍     | 43/97 [00:25<00:33,  1.60it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 45%|████▌     | 44/97 [00:26<00:32,  1.64it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 46%|████▋     | 45/97 [00:26<00:31,  1.67it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 47%|████▋     | 46/97 [00:27<00:30,  1.69it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 48%|████▊     | 47/97 [00:27<00:29,  1.71it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 49%|████▉     | 48/97 [00:28<00:28,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 51%|█████     | 49/97 [00:28<00:27,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 52%|█████▏    | 50/97 [00:29<00:26,  1.75it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 53%|█████▎    | 51/97 [00:30<00:26,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 54%|█████▎    | 52/97 [00:30<00:26,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 55%|█████▍    | 53/97 [00:31<00:25,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 56%|█████▌    | 54/97 [00:31<00:25,  1.69it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 57%|█████▋    | 55/97 [00:32<00:24,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 58%|█████▊    | 56/97 [00:33<00:23,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 59%|█████▉    | 57/97 [00:33<00:23,  1.70it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 60%|█████▉    | 58/97 [00:34<00:22,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 61%|██████    | 59/97 [00:34<00:21,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 62%|██████▏   | 60/97 [00:35<00:21,  1.69it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 63%|██████▎   | 61/97 [00:35<00:20,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 64%|██████▍   | 62/97 [00:36<00:20,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 65%|██████▍   | 63/97 [00:37<00:19,  1.75it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 66%|██████▌   | 64/97 [00:37<00:18,  1.75it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 67%|██████▋   | 65/97 [00:38<00:18,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 68%|██████▊   | 66/97 [00:38<00:17,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 69%|██████▉   | 67/97 [00:39<00:17,  1.70it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 70%|███████   | 68/97 [00:40<00:16,  1.71it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 71%|███████   | 69/97 [00:40<00:16,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 72%|███████▏  | 70/97 [00:41<00:15,  1.72it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 73%|███████▎  | 71/97 [00:41<00:14,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 74%|███████▍  | 72/97 [00:42<00:14,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 75%|███████▌  | 73/97 [00:42<00:13,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 76%|███████▋  | 74/97 [00:43<00:13,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 77%|███████▋  | 75/97 [00:44<00:12,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 78%|███████▊  | 76/97 [00:44<00:11,  1.77it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 79%|███████▉  | 77/97 [00:45<00:11,  1.77it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 80%|████████  | 78/97 [00:45<00:10,  1.78it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 81%|████████▏ | 79/97 [00:46<00:10,  1.78it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 82%|████████▏ | 80/97 [00:46<00:09,  1.78it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 84%|████████▎ | 81/97 [00:47<00:09,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 85%|████████▍ | 82/97 [00:47<00:08,  1.78it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 86%|████████▌ | 83/97 [00:48<00:08,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 87%|████████▋ | 84/97 [00:49<00:07,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 88%|████████▊ | 85/97 [00:49<00:06,  1.77it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 89%|████████▊ | 86/97 [00:50<00:06,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 90%|████████▉ | 87/97 [00:50<00:05,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 91%|█████████ | 88/97 [00:51<00:05,  1.77it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 92%|█████████▏| 89/97 [00:51<00:04,  1.74it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 93%|█████████▎| 90/97 [00:52<00:03,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 94%|█████████▍| 91/97 [00:53<00:03,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 95%|█████████▍| 92/97 [00:53<00:02,  1.76it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 96%|█████████▌| 93/97 [00:54<00:02,  1.77it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 97%|█████████▋| 94/97 [00:54<00:01,  1.73it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 98%|█████████▊| 95/97 [00:55<00:01,  1.69it/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

 99%|█████████▉| 96/97 [00:56<00:00,  1.71it/s]

Saving the dataset (0/1 shards):   0%|          | 0/139 [00:00<?, ? examples/s]

100%|██████████| 97/97 [00:56<00:00,  1.73it/s]


AttributeError: 'Dataset' object has no attribute 'concatenate'

In [6]:
import datasets
dss = [Dataset.load_from_disk(chunk_path) for chunk_path in chunk_paths]

final_dataset = datasets.concatenate_datasets(dss)

In [7]:
final_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (0/10 shards):   0%|          | 0/960139 [00:00<?, ? examples/s]

In [9]:
ds = datasets.load_from_disk("tokenized_dataset")