In [7]:
# %%
import os
from datasets import load_dataset
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %%
# Create data directory if it doesn't exist
data_dir = 'data/tinystories'
os.makedirs(data_dir, exist_ok=True)

# %%
# Load the TinyStories dataset
train_dataset = load_dataset('roneneldan/TinyStories', split='train')
val_dataset = load_dataset('roneneldan/TinyStories', split='validation')

# %%
# Define the special tokens
SPECIAL_TOKENS = {
    "eos_token": "<|endoftext|>"
}

# %%
# Initialize the tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = ByteLevel()

# Set up the trainer
VOCAB_SIZE = 1000  # Custom vocabulary size
trainer = BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=[SPECIAL_TOKENS["eos_token"], "[UNK]"]
)

# %%
# Prepare the training data for the tokenizer
def get_training_corpus():
    for sample in train_dataset:
        yield sample['text']

In [10]:
# Train the tokenizer
# tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
SPECIAL_TOKENS = {
    "eos_token": "<|endoftext|>"
}
# %%
# Configure the tokenizer post-processing
tokenizer.post_processor = TemplateProcessing(
    single="$A " + SPECIAL_TOKENS["eos_token"],
    special_tokens=[
        (SPECIAL_TOKENS["eos_token"], tokenizer.token_to_id(SPECIAL_TOKENS["eos_token"]))
    ],
)

# Save the tokenizer
# tokenizer.save("tinystories_tokenizer.json")

NameError: name 'tokenizer' is not defined

In [11]:
# Load it with transformers for compatibility
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tinystories_tokenizer.json",
    eos_token="<|endoftext|>",
    unk_token="[UNK]",
    pad_token="[PAD]"  # Optional, but can be useful
)

In [12]:
# %%
# Function to tokenize and save dataset to binary file
def tokenize_and_save(dataset, filename):
    # Initialize a list to hold all tokens
    all_tokens = []

    # Process each text entry in the dataset
    for sample in dataset:
        tokens = tokenizer.encode(sample['text'])
        # Tokens already include the eos_token due to post-processing
        all_tokens.extend(tokens)

    # Convert tokens to NumPy array
    arr = np.array(all_tokens, dtype=np.uint16)

    # Prepare the header
    header = np.zeros(256, dtype=np.int32)
    header[0] = 20240520  # Magic number
    header[1] = 1         # Version
    header[2] = arr.size  # Number of tokens

    # Save to binary file
    with open(filename, 'wb') as f:
        f.write(header.tobytes())
        arr.tofile(f)

In [13]:
# %%
# Tokenize and save the validation dataset
val_bin_path = os.path.join(data_dir, 'val.bin')
tokenize_and_save(val_dataset, val_bin_path)

In [15]:
# %%
# Tokenize and save the training dataset
train_bin_path = os.path.join(data_dir, 'train.bin')
tokenize_and_save(train_dataset, train_bin_path)

KeyboardInterrupt: 

In [16]:
import numpy as np
# Verify the maximum token ID is within the vocabulary size
train_bin_path = 'train.bin'
val_bin_path = 'val.bin'
def verify_tokens(filename):
    with open(filename, 'rb') as f:
        header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
        tokens = np.frombuffer(f.read(), dtype=np.uint16)
    max_token_id = tokens.max()
    print(f"Maximum token id in {filename}: {max_token_id}")
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print(len(header))
    print(len(tokens), '\n')

verify_tokens(train_bin_path)
verify_tokens(val_bin_path)

Maximum token id in train.bin: 999
Vocabulary size: 1000
256
611289391 

Maximum token id in val.bin: 999
Vocabulary size: 1000
256
6150326 



In [2]:
%pwd

'/home/jovyan/fokin/modded-nanogpt/data/tinystories'

In [3]:
import json

# Load the JSON files
with open('/home/jovyan/fokin/modded-nanogpt/data/tinystories/tinystories_tokenizer.json') as f1, open('/home/jovyan/fokin/modded-nanogpt/tinystories_tokenizer.json') as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Compare the JSON objects
if data1 == data2:
    print("The JSON files are identical.")
else:
    print("The JSON files differ.")

The JSON files are identical.
