In [None]:
import sys
from pathlib import Path

# Locate the repository root so that the `src` package is importable from this
# notebook. Starting at the current working directory, walk upward and take the
# first directory that contains a `src` entry.
here = Path.cwd().resolve()
repo_root = next(
    (candidate for candidate in (here, *here.parents) if (candidate / "src").exists()),
    None,
)
if repo_root is None:
    # Nothing found: keep the historical guess (two levels up) when the path is
    # deep enough; otherwise fall back to the working directory itself instead
    # of raising IndexError on a shallow path.
    repo_root = here.parents[1] if len(here.parents) > 1 else here

# Guarded insert keeps the cell idempotent when it is re-run.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

In [None]:
import os
import urllib.request
import re

In [None]:
# Fetch the sample text once and reuse the local copy on later runs.
# `url` and `file_path` are bound unconditionally so both names exist even when
# the download is skipped because the file is already present.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"

if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

In [None]:
# Load the whole story into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as source_file:
    raw_text = source_file.read()

# Sanity check: report the size and preview the first 99 characters.
char_count = len(raw_text)
print("Total number of character:", char_count)
print(raw_text[:99])

#### The main goal is to create a vocabulary, i.e. a dictionary mapping each token to a token id
Steps:
<ul>
    <li>Split the text into tokens using a regex</li>
    <li>Create a mapping from token to token id for the whole text -> the vocabulary</li>
</ul>

In [None]:
# Tokenisation demo on one short sentence: split on punctuation, double dashes
# and whitespace (the capture group keeps the delimiters in the output), then
# strip every piece and discard the ones that end up empty.
text = "Hello, world. Is this-- a test?"

result = list(filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))))
print(result)

In [None]:
# Run the same splitting rule over the full story text: split, strip each
# piece, and keep only the non-empty ones.
tokens = list(filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))))
print(tokens[:10])

In [None]:
# Assign ids 0..N-1 to the unique tokens in sorted order.
ordered_tokens = sorted(set(tokens))
vocab = dict(zip(ordered_tokens, range(len(ordered_tokens))))

In [None]:
# Vocabulary size, i.e. the number of distinct keys in `vocab`.
len(vocab)

In [None]:
# Display the container type used for the vocabulary.
type(vocab)

In [None]:
# Peek at the last five (token, id) pairs of the vocabulary. dict.items() is
# not sliceable, hence the list() copy; no index is needed, so iterate directly.
for item in list(vocab.items())[-5:]:
    print(item)

In [None]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# regular tokens, then number every entry by its position in that list.
all_tokens = sorted(set(tokens)) + ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [None]:
# Peek at the last five (token, id) pairs of the vocabulary. dict.items() is
# not sliceable, hence the list() copy; no index is needed, so iterate directly.
for item in list(vocab.items())[-5:]:
    print(item)

In [None]:
from src.gpt_blocks.tokenizer import SimpleTokenizerV1

# Build the first tokenizer variant from the current vocabulary.
tokenizer_obj = SimpleTokenizerV1(vocab=vocab)

# Sample passage to encode; the triple-quoted literal spans two lines and
# starts with a literal double quote.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

# NOTE(review): presumably returns a list of integer token ids — confirm
# against SimpleTokenizerV1.encode in src/gpt_blocks/tokenizer.py.
ids = tokenizer_obj.encode(text)

In [None]:
# Display the result of encoding the sample passage.
ids

In [None]:
# Two unrelated sentences glued together with the end-of-text marker.
# NOTE(review): "Hello" and "palace" are expected to be missing from the
# vocabulary, so a tokenizer with unknown-token handling should map them to <|unk|>.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 + " <|endoftext|> " + text2
print(text)

In [None]:
from src.gpt_blocks.tokenizer import SimpleTokenizerV2
# Second tokenizer variant, built from the same `vocab` dictionary.
# NOTE(review): presumably maps out-of-vocabulary words to <|unk|> — confirm in
# src/gpt_blocks/tokenizer.py.
tokenizer = SimpleTokenizerV2(vocab)

In [None]:
# Print what the tokenizer's encode() returns for the two-sentence sample.
print(tokenizer.encode(text))

In [None]:
# Round trip: encode the sample, then decode the result back and print it.
roundtrip_ids = tokenizer.encode(text)
print(tokenizer.decode(roundtrip_ids))

In [None]:
import tiktoken 

In [None]:
# Switch to the "gpt2" encoding provided by tiktoken.
# NOTE(review): this rebinds `tokenizer`, which previously held the
# SimpleTokenizerV2 instance — re-running earlier cells after this one will use
# the tiktoken encoding instead.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# allowed_special whitelists "<|endoftext|>" so the marker inside `text` is accepted.
# NOTE(review): presumably it is then encoded as a single special-token id —
# confirm in the tiktoken documentation.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

In [None]:
# Decode the ids back to text to check the round trip.
strings = tokenizer.decode(integers)
print(strings)

In [None]:
# Encode a made-up string that is not a real word.
# NOTE(review): presumably broken into several sub-word ids rather than
# rejected — confirm from the printed output.
print(tokenizer.encode("Akwirw ier"))

In [None]:
# Round trip for the made-up string: encode it, then decode the result again.
unknown_word_ids = tokenizer.encode("Akwirw ier")
print(tokenizer.decode(unknown_word_ids))