In [None]:
# Reload edited project modules automatically before each cell executes, so
# changes to the project source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Provides the memory-profiling magics (e.g. %memit).
%load_ext memory_profiler

# Train BPETokenizer

## Train BPETokenizer on TinyStories.

In [None]:
from cs336_basics.tokenizer import BPETokenizer
from cs336_basics.utils import ROOT_PATH

# Train a BPE tokenizer with a 10000-entry vocabulary on the TinyStories
# training split and save it so later cells can load it from disk.
tok = BPETokenizer()
# Decode explicitly as UTF-8: open()'s default text encoding depends on the
# platform locale, which could silently change the training corpus.
with open(ROOT_PATH / "data/TinyStoriesV2-GPT4-train.txt", "r", encoding="utf-8") as f:
    text = f.read()  # The whole file is read into memory at once.
tok.train(text, vocab_size=10000, verbose=True)
tok.save(ROOT_PATH / "tokenizer/tinystories_train_10000.pt")

In [None]:
# Check the compression ratio of the TinyStories tokenizer on the validation
# splits of both datasets.
from cs336_basics.utils import compression_ratio
from cs336_basics.tokenizer import BPETokenizer
from cs336_basics.utils import ROOT_PATH

tok = BPETokenizer.load(ROOT_PATH / "tokenizer/tinystories_train_10000.pt")

# (dataset label, validation file) pairs to evaluate.
for dataset_name, valid_path in (
    ("TinyStories", "data/TinyStoriesV2-GPT4-valid.txt"),
    ("OpenWebText", "data/owt_valid.txt"),
):
    # Decode explicitly as UTF-8: open()'s default encoding is locale-dependent.
    with open(ROOT_PATH / valid_path, "r", encoding="utf-8") as f:
        text = f.read(1_000_000)  # Read only 1M characters.
    # The sample is already in memory, so the file is closed before tokenizing.
    print(f"Compression ratio on {dataset_name}: {compression_ratio(tok, text):.2f}")

# Output recorded from a previous run:
# Compression ratio on TinyStories: 4.11
# Compression ratio on OpenWebText: 3.22

## Train BPETokenizer on OpenWebText.

In [None]:
from cs336_basics.tokenizer import BPETokenizer
from cs336_basics.utils import ROOT_PATH, read_file_to_str_iterable

tok = BPETokenizer()

# Feed the OpenWebText training text through an iterable rather than reading
# the whole file into memory.
texts = read_file_to_str_iterable(
    ROOT_PATH / "data/owt_train.txt",
    special_tokens=["<|endoftext|>"],
    buffer_size_bytes=100_000_000,
)
tok.train_iterable(texts, vocab_size=32000, verbose=True)
tok.save(ROOT_PATH / "tokenizer/owt_train_32000.pt")

# Progress output recorded from a previous run:
# Pretokenizing: 119it [26:27, 13.34s/it]
# Computing pair counts: 6601892it [00:07, 857637.28it/s]
# Merging: 100%|██████████| 31743/31743 [42:27<00:00, 12.46it/s]

In [None]:
# Check the compression ratio of the OpenWebText tokenizer on the validation
# splits of both datasets.
from cs336_basics.utils import compression_ratio
from cs336_basics.tokenizer import BPETokenizer
from cs336_basics.utils import ROOT_PATH

tok = BPETokenizer.load(ROOT_PATH / "tokenizer/owt_train_32000.pt")

# (dataset label, validation file) pairs to evaluate.
for dataset_name, valid_path in (
    ("TinyStories", "data/TinyStoriesV2-GPT4-valid.txt"),
    ("OpenWebText", "data/owt_valid.txt"),
):
    # Decode explicitly as UTF-8: open()'s default encoding is locale-dependent.
    with open(ROOT_PATH / valid_path, "r", encoding="utf-8") as f:
        text = f.read(1_000_000)  # Read only 1M characters.
    # The sample is already in memory, so the file is closed before tokenizing.
    print(f"Compression ratio on {dataset_name}: {compression_ratio(tok, text):.2f}")

# Output recorded from a previous run:
# Compression ratio on TinyStories: 4.00
# Compression ratio on OpenWebText: 4.48

# Encode train and validation data

## Encode TinyStories

In [None]:
import numpy as np
from cs336_basics.utils import ROOT_PATH, read_file_to_str_iterable
from cs336_basics.tokenizer import BPETokenizer

tok = BPETokenizer.load(ROOT_PATH / "tokenizer/tinystories_train_10000.pt")

# Encode both TinyStories splits and store each token stream as a uint16 .npy
# array. NOTE(review): uint16 presumably suffices because the tokenizer was
# trained with vocab_size=10000 -- confirm ids never exceed 65535.
for text_path, tokens_path in (
    ("data/TinyStoriesV2-GPT4-train.txt", "data/TinyStoriesV2-GPT4-train-tokens.npy"),
    ("data/TinyStoriesV2-GPT4-valid.txt", "data/TinyStoriesV2-GPT4-valid-tokens.npy"),
):
    texts = read_file_to_str_iterable(
        ROOT_PATH / text_path,
        special_tokens=["<|endoftext|>"],
        buffer_size_bytes=10_000_000,  # Read and encode ~10MB per time.
    )
    tokens = tok.encode_iterable(texts, verbose=True)
    # np.fromiter consumes the token stream directly, avoiding the intermediate
    # Python list of every token that np.array(list(tokens)) would build.
    # Assumes encode_iterable yields individual integer token ids.
    np.save(ROOT_PATH / tokens_path, np.fromiter(tokens, dtype=np.uint16))

In [None]:
# Import everything this cell uses so it also runs on a fresh kernel instead of
# relying on names left behind by earlier cells.
import numpy as np
from cs336_basics.tokenizer import BPETokenizer
from cs336_basics.utils import ROOT_PATH

tok = BPETokenizer.load(ROOT_PATH / "tokenizer/tinystories_train_10000.pt")

# Memory-map the saved token array instead of loading it fully into RAM.
tokens_memmap = np.load(ROOT_PATH / "data/TinyStoriesV2-GPT4-valid-tokens.npy", mmap_mode="r")
middle = len(tokens_memmap) // 2
separator = "------------------------------------------------------"

# Decode few pieces (start, middle, end) to visually check correctness.
print(tok.decode(tokens_memmap[0:100]))
print(separator)
print(tok.decode(tokens_memmap[middle:middle + 100]))
print(separator)
print(tok.decode(tokens_memmap[-100:]))

# Check that decoding the tokens obtains the original text.
print(separator)
# Decode explicitly as UTF-8: open()'s default encoding is locale-dependent,
# which could make this comparison fail for reasons unrelated to the tokenizer.
with open(ROOT_PATH / "data/TinyStoriesV2-GPT4-valid.txt", "r", encoding="utf-8") as f:
    text_orig = f.read()
text_dec = tok.decode(tokens_memmap)
print("Decoding works:", text_orig == text_dec)

## Encode OpenWebText

Here we experiment with flushing the tokens to disk every X MB. This keeps RAM
consumption low, at the cost of slightly complicating the saving and loading of
the NumPy array, since we have to write a raw byte file.

In [None]:
from cs336_basics.tokenizer import BPETokenizer
import numpy as np
from cs336_basics.utils import ROOT_PATH, read_file_to_str_iterable, write_int_iterable_to_byte_file

tok = BPETokenizer.load(ROOT_PATH / "tokenizer/owt_train_32000.pt")

# Encode the train split first, then the validation split. Each entry is
# (source text, output byte file, output metadata file).
for text_path, bin_path, metadata_path in (
    ("data/owt_train.txt", "data/owt_train_tokens.bin", "data/owt_train_tokens_metadata.json"),
    ("data/owt_valid.txt", "data/owt_valid_tokens.bin", "data/owt_valid_tokens_metadata.json"),
):
    texts = read_file_to_str_iterable(
        ROOT_PATH / text_path,
        special_tokens=["<|endoftext|>"],
        buffer_size_bytes=10_000_000,  # Read and encode ~10MB per time.
    )
    tokens = tok.encode_iterable(texts, verbose=True)
    write_int_iterable_to_byte_file(
        ROOT_PATH / bin_path,
        ROOT_PATH / metadata_path,
        tokens,
        dtype=np.uint16,
        buffer_size_bytes=100_000_000,  # Flush to the byte file every ~100MB.
    )

# Progress output recorded for the train split in a previous run:
# Encoding: 1264it [3:31:56, 10.06s/it]

In [None]:
from cs336_basics.tokenizer import BPETokenizer
from cs336_basics.utils import ROOT_PATH, read_byte_file_to_memmap

tok = BPETokenizer.load(ROOT_PATH / "tokenizer/owt_train_32000.pt")

# Load the encoded validation tokens from the byte file and its metadata file.
tokens_memmap = read_byte_file_to_memmap(
    ROOT_PATH / "data/owt_valid_tokens.bin",
    ROOT_PATH / "data/owt_valid_tokens_metadata.json")
middle = len(tokens_memmap) // 2
separator = "------------------------------------------------------"

# Decode few pieces (start, middle, end) to visually check correctness.
print(tok.decode(tokens_memmap[0:100]))
print(separator)
print(tok.decode(tokens_memmap[middle:middle + 100]))
print(separator)
print(tok.decode(tokens_memmap[-100:]))

# Check that decoding the tokens obtains the original text.
print(separator)
# Decode explicitly as UTF-8: open()'s default encoding is locale-dependent,
# which could make this comparison fail for reasons unrelated to the tokenizer.
with open(ROOT_PATH / "data/owt_valid.txt", "r", encoding="utf-8") as f:
    text_orig = f.read()
text_dec = tok.decode(tokens_memmap)
print("Decoding works:", text_orig == text_dec)

# Train LLM

In [3]:
# NOTE(review): placeholder scratch cell under "Train LLM" -- presumably to be
# replaced by the actual training code; confirm before sharing the notebook.
x = 10

In [5]:
# Display the placeholder value assigned to `x` in the previous cell.
x

10