|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Words to tokens to numbers</h1>|
|<h2>Lecture:</h2>|<h1><b>Exploring ChatGPT4's tokenizer</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# matplotlib defaults: request the 'svg' format for figures rendered inline in the notebook
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# need to install the tiktoken library to get OpenAI's tokenizer
# note: it's tik-token, not tiktok-en :P
!pip install tiktoken
import tiktoken

In [None]:
# GPT-4's tokenizer: load the 'cl100k_base' encoding object from tiktoken
tokenizer = tiktoken.get_encoding('cl100k_base')
# list the attributes and methods available on the tokenizer object
dir(tokenizer)

In [None]:
# get help: IPython's `??` suffix displays the object's documentation (and source, when available)
# note: this is IPython-only syntax and will not run in a plain Python script
tokenizer??

In [None]:
# vocab size, as reported by the tokenizer's n_vocab attribute
tokenizer.n_vocab

In [None]:
# decode the tokenizer's eot_token ID (presumably "end of text") back into its string form
tokenizer.decode([tokenizer.eot_token])

In [None]:
# but not all token IDs below n_vocab are valid, e.g., 100261.
# Decoding such an ID raises an error; it is caught and displayed here so that
# the demonstration does not halt a "Restart & Run All" of the notebook.
print(tokenizer.n_vocab)
try:
  print(tokenizer.decode([100261]))
except Exception as err:
  print(f'Token 100261 cannot be decoded -> {type(err).__name__}: {err}')

In [None]:
# list of all tokens:
# https://github.com/vnglst/gpt4-tokens/blob/main/decode-tokens.ipynb

# Explore some tokens

In [None]:
# show the text behind 50 consecutive token IDs, one "ID = text" pair per line
for token_id in range(1000,1050):
  token_text = tokenizer.decode([token_id])
  print(f'{token_id} = {token_text}')

# Tokenization!

In [None]:
# example sentence to tokenize
text = "My name is Mike and I like toothpaste-flavored chocolate."
# encode the sentence into its sequence of token IDs and show them
tokens = tokenizer.encode(text)
print(tokens)

In [None]:
# for comparison: split the sentence on whitespace into "words"
text.split()

In [None]:
# encode every whitespace-separated piece of the sentence on its own
# and report which token ID(s) it maps to
for chunk in text.split():
  chunk_tokens = tokenizer.encode(chunk)
  print(f'"{chunk}" comprises token(s) {chunk_tokens}')

In [None]:
# decode the sentence's token IDs one at a time to see the text piece behind each ID
for token_id in tokens:
  piece = tokenizer.decode([token_id])
  print(f'Token {token_id:>6} is "{piece}"')

In [None]:
# with special (non-ASCII) characters:
# encode a single accented letter and inspect the resulting token ID(s)
tokenizer.encode('â')

# How long are the tokens?

In [None]:
# initialize lengths vector (one entry per token ID below n_vocab)
token_lengths = np.zeros(tokenizer.n_vocab)

# get the number of characters in each token
for idx in range(tokenizer.n_vocab):
  try:
    token_lengths[idx] = len(tokenizer.decode([idx]))
  except:
    # IDs that cannot be decoded are marked as NaN (missing) rather than 0.
    # NOTE(review): the catch is deliberately left broad -- the exception type
    # raised for an invalid ID may differ between tiktoken versions; confirm
    # before narrowing it.
    token_lengths[idx] = np.nan

# count unique lengths
# (NaN placeholders are removed first so they do not appear as a "length"
#  in the counts, the bar chart, or the x-axis limit)
valid_lengths = token_lengths[~np.isnan(token_lengths)]
uniqueLengths,tokenCount = np.unique(valid_lengths,return_counts=True)

# visualize
_,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: length of every token by index (NaN entries are simply not drawn)
axs[0].plot(token_lengths,'k.',markersize=3,alpha=.4)
axs[0].set(xlim=[0,tokenizer.n_vocab],xlabel='Token index',ylabel='Token length (characters)',
           title='GPT4 token lengths')

# right panel: how many tokens have each length; the y-axis is log-scaled,
# matching what the axis label states
axs[1].bar(uniqueLengths,tokenCount,color='k',edgecolor='gray')
axs[1].set(xlim=[0,uniqueLengths.max()],xlabel='Token length (chars)',ylabel='Token count (log scale)',
           yscale='log',title='Distribution of token lengths')

plt.tight_layout()
plt.show()

# Many word-tokens start with spaces

In [None]:
# single-token words with vs. without spaces:
# the leading space is part of the string being encoded, so compare the two printed results
print( tokenizer.encode(' Michael') )
print( tokenizer.encode('Michael') )

In [None]:
# a word that becomes multiple tokens when encoded without the leading space
# (compare the lengths of the two printed lists)
print( tokenizer.encode(' Peach') )
print( tokenizer.encode('Peach') )

In [None]:
# decode each token ID of 'Peach' separately to reveal the text fragments it is split into
peach = tokenizer.encode('Peach')
[tokenizer.decode([token_id]) for token_id in peach]

# The Time Machine book encoded

In [None]:
import requests
import re

# download the plain-text book from Project Gutenberg
#  - timeout: do not hang indefinitely on a stalled connection
#  - raise_for_status: stop with a clear error on an HTTP failure instead of
#    silently splitting/tokenizing an error page in the cells that follow
response = requests.get('https://www.gutenberg.org/files/35/35-0.txt',timeout=30)
response.raise_for_status()
text = response.text

# split by punctuation
# (the capturing group makes re.split keep the delimiters, so punctuation marks
#  are items in the list too; empty and whitespace-only pieces are dropped)
words = re.split(r'([,.:;—?_!"“()\']|--|\s)',text)
words = [item.strip() for item in words if item.strip()]
print(f'There are {len(words)} words.')

# peek at a 50-item window from the middle of the list
words[10000:10050]

In [None]:
# draw one item at random from the word list (unseeded, so it differs on every run)
# and show the token ID(s) it encodes to
someRandomWord = np.random.choice(words)
randomWordTokens = tokenizer.encode(someRandomWord)
print(f'"{someRandomWord}" has token {randomWordTokens}')

In [None]:
# report how many tokens each of the first 20 items in the word list needs
for word in words[:20]:
  n_tokens = len(tokenizer.encode(word))
  print(f'"{word}" has {n_tokens} tokens')

In [None]:
# compare the token IDs produced for three different casings of the same word
for variant in ('book','Book','bOok'):
  variant_tokens = tokenizer.encode(variant)
  print(f'"{variant}" has tokens {variant_tokens}')

# But do we need to separate the text into words?

In [None]:
# what happens if we just tokenize the raw (unprocessed) text?
# encode the whole text in a single call, then compare the token count with the word count
tmTokens = tokenizer.encode(text)
print(f'The text has {len(tmTokens):,} tokens and {len(words):,} words.')

In [None]:
# check out some tokens:
# walk through a 30-token window of the encoded text, printing each ID with its decoded piece
for token_id in tmTokens[9990:10020]:
  print(f'Token {token_id:>6}: "{tokenizer.decode([token_id])}"')

In [None]:
# decode tokens 9990-10019 of the encoded text in a single call to read them as continuous text
print(tokenizer.decode(tmTokens[9990:10020]))