In [None]:
!pip install datasets
!pip install tiktoken
!pip install gutenbergpy

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

# Importing Necessary Libraries

In [None]:
import re
import importlib
import tiktoken
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import requests

# Getting Our Corpus

The notebook retrieves the full text of Moby-Dick from Project Gutenberg. The text is stored in the variable **corpus** for tokenization.

In [None]:
# Plain-text edition of Moby-Dick on Project Gutenberg, used as the tokenizer corpus.
url = "https://www.gutenberg.org/files/2701/2701-0.txt"
# Bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of tokenizing an error page.
response.raise_for_status()

corpus = response.text

# Creating Basic Tokenizer from Corpus

This initial version of the tokenizer processes the corpus by:
* Splitting the text with a regular expression on common punctuation and whitespace; whitespace-only pieces are discarded.
* Constructing a vocabulary as a sorted, unique list of words and punctuation marks from the corpus.
* Mapping each word to a unique ID in **vocab_to_id** for encoding and in **id_to_vocab** for decoding.

The tokenizer provides two main functions:
1. **encode(text)**: Converts a given text into a list of token IDs based on the vocabulary.
2. **decode(tokens)**: Rebuilds text from token IDs by joining the tokens with single spaces and removing the space before punctuation (the original whitespace is not preserved).

In [None]:
class LLMBasicTokenizerV1:
  """Word-level tokenizer whose vocabulary is built from a text corpus.

  The corpus is split on whitespace, common punctuation marks and the double
  dash ("--"). Punctuation marks and "--" become tokens of their own, while
  whitespace is discarded. Every distinct token gets an integer ID equal to
  its position in the sorted vocabulary.

  Attributes:
    vocab_to_id: dict mapping each token string to its integer ID.
    id_to_vocab: dict mapping each integer ID back to its token string.
  """

  # Split points. The capture group makes re.split keep the delimiters, so
  # punctuation is returned as separate pieces instead of being thrown away.
  _SPLIT_PATTERN = re.compile(r'([,.;:?!_"()\']|--|\s)')
  # Whitespace that directly precedes a punctuation mark (removed on decode).
  _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.;:?!_"()\'])')

  def __init__(self, corpus):
    """Build the vocabulary and both lookup tables from `corpus` (a str)."""
    vocab = sorted(set(self._tokenize(corpus)))
    self.vocab_to_id = {word: i for i, word in enumerate(vocab)}
    self.id_to_vocab = {i: word for i, word in enumerate(vocab)}

  @classmethod
  def _tokenize(cls, text):
    """Split `text` into stripped, non-empty token strings."""
    pieces = (piece.strip() for piece in cls._SPLIT_PATTERN.split(text))
    return [piece for piece in pieces if piece]

  def encode(self, text):
    """Return the list of token IDs for `text`.

    Tokens that are not in the vocabulary are skipped silently.
    """
    return [
        self.vocab_to_id[token]
        for token in self._tokenize(text)
        if token in self.vocab_to_id
    ]

  def decode(self, tokens):
    """Return the text for an iterable of token IDs.

    Tokens are joined with single spaces and the space before a punctuation
    mark is removed. Raises KeyError for an ID that is not in the vocabulary.
    """
    joined_tokens = " ".join(self.id_to_vocab[token] for token in tokens)
    return self._SPACE_BEFORE_PUNCT.sub(r'\1', joined_tokens)


In [None]:
# Round-trip a sample sentence through the V1 tokenizer built from the corpus.
tokenizer = LLMBasicTokenizerV1(corpus)
text = "Can you please, please give me your name so I can see who you actually are."
token_ids = tokenizer.encode(text)
decoded_text = tokenizer.decode(token_ids)
# Show the IDs first, then the reconstructed sentence, one per line.
print(token_ids, decoded_text, sep="\n")

[1046, 32687, 22807, 22805, 14878, 19774, 32705, 20703, 26968, 2365, 8371, 25732, 32086, 32687, 5284, 6145]
Can you please, please give me your name so I can see who you actually are.


# Creating a More Advanced Tokenizer

* This version extends LLMBasicTokenizerV1 by introducing two special tokens, `<|endoftext|>` and `<|unk|>`, to handle text boundaries and unknown words.
* This tokenizer allows better handling of unknown or out-of-vocabulary tokens by introducing an unknown token placeholder.
* Like LLMBasicTokenizerV1, it provides the encode and decode methods, enhancing flexibility for more robust tokenization.

In [None]:
class LLMBasicTokenizerV2:
  """Word-level tokenizer with special tokens for text boundaries and unknowns.

  The corpus is split on whitespace, common punctuation marks and the double
  dash ("--"). Punctuation marks and "--" become tokens of their own, while
  whitespace is discarded. The special tokens "<|endoftext|>" and "<|unk|>"
  are appended after the sorted corpus vocabulary.

  Attributes:
    vocab_to_id: dict mapping each token string to its integer ID.
    id_to_vocab: dict mapping each integer ID back to its token string.
  """

  END_OF_TEXT = "<|endoftext|>"
  UNKNOWN = "<|unk|>"

  # Split points. The capture group makes re.split keep the delimiters, so
  # punctuation is returned as separate pieces instead of being thrown away.
  _SPLIT_PATTERN = re.compile(r'([,.;:?!_"()\']|--|\s)')
  # Whitespace that directly precedes a punctuation mark (removed on decode).
  _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.;:?!_"()\'])')

  def __init__(self, corpus):
    """Build the vocabulary and both lookup tables from `corpus` (a str)."""
    corpus_tokens = set(self._tokenize(corpus))
    vocab = sorted(corpus_tokens)
    # Append the special tokens last; skip any the corpus already contains so
    # no token ends up with two IDs.
    vocab.extend(
        special
        for special in (self.END_OF_TEXT, self.UNKNOWN)
        if special not in corpus_tokens
    )
    self.vocab_to_id = {word: i for i, word in enumerate(vocab)}
    self.id_to_vocab = {i: word for i, word in enumerate(vocab)}

  @classmethod
  def _tokenize(cls, text):
    """Split `text` into stripped, non-empty token strings."""
    pieces = (piece.strip() for piece in cls._SPLIT_PATTERN.split(text))
    return [piece for piece in pieces if piece]

  def encode(self, text):
    """Return the list of token IDs for `text`.

    Tokens that are not in the vocabulary are encoded as the "<|unk|>" ID
    instead of being dropped, so the output keeps one ID per input token.
    """
    unknown_id = self.vocab_to_id[self.UNKNOWN]
    return [
        self.vocab_to_id.get(token, unknown_id)
        for token in self._tokenize(text)
    ]

  def decode(self, tokens):
    """Return the text for an iterable of token IDs.

    Tokens are joined with single spaces and the space before a punctuation
    mark is removed. Raises KeyError for an ID that is not in the vocabulary.
    """
    joined_tokens = " ".join(self.id_to_vocab[token] for token in tokens)
    return self._SPACE_BEFORE_PUNCT.sub(r'\1', joined_tokens)


In [None]:
# Round-trip a sentence through the V2 tokenizer; the sentence starts with a
# name ("Timothy") to exercise a word beyond the ones used in the V1 demo.
tokenizer = LLMBasicTokenizerV2(corpus)
text = "Timothy, can you please, please give me your name so I can see who you actually are."
token_ids = tokenizer.encode(text)
decoded_text = tokenizer.decode(token_ids)
# Show the IDs first, then the reconstructed sentence, one per line.
print(token_ids, decoded_text, sep="\n")

[8371, 32687, 22807, 22805, 14878, 19774, 32705, 20703, 26968, 2365, 8371, 25732, 32086, 32687, 5284, 6145]
can you please, please give me your name so I can see who you actually are.
