|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Words to tokens to numbers</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Tokenizing The Time Machine</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# typical libraries...
import numpy as np
import matplotlib.pyplot as plt

# for importing and working with texts
import requests
import re
import string

# adjust matplotlib defaults to personal preferences
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Get and prepare the text

In [None]:
# get raw text from internet
# note: the timeout prevents the cell from hanging forever on a stalled connection,
#       and raise_for_status() stops here on an HTTP error (e.g., 404) instead of
#       silently tokenizing an error page in the cells below
response = requests.get('https://www.gutenberg.org/files/35/35-0.txt',timeout=30)
response.raise_for_status()
text = response.text

# character strings to replace with space
# (the multi-character string is listed before its parts so that it becomes one single space)
strings2replace = [ '\r\n\r\nâ\x80\x9c','â\x80\x9c','â\x80\x9d','\r\n','â\x80\x94','â\x80\x99','â\x80\x98','_', ]

# replace those strings with space
# note: these are literal strings, not regex patterns, so a plain string
#       replacement is used (nothing can be misinterpreted as a regex metacharacter)
for str2match in strings2replace:
  text = text.replace(str2match,' ')

# remove non-ASCII characters and numbers, and make lower-case
# note: raw strings so that the backslash sequences reach the regex engine intact
text = re.sub(r'[^\x00-\x7F]+', ' ', text)
text = re.sub(r'\d+','',text).lower()

In [None]:
# split the text at any run of punctuation and/or whitespace
# note: raw f-string so that \s reaches the regex engine intact, and re.escape() so that
#       every punctuation character (including the backslash) is a literal member of the [...] class
words = re.split(rf'[{re.escape(string.punctuation)}\s]+',text)

# keep only words that contain >1 character
# (this also drops the empty strings that re.split creates at the text boundaries;
#  no strip() is needed because whitespace was already consumed by the split)
words = [item for item in words if len(item)>1]

# create the vocab / lexicon (sorted list of the unique words)
vocab = sorted(set(words))
nWords = len(words) # total number of words in the text
nLex = len(vocab)   # number of unique words

In [None]:
# create the encoder/decoding mapping dictionaries
# (a word's token index is its position in vocab)
idx2word = dict(enumerate(vocab))                       # index -> word (decoder)
word2idx = {word:idx for idx,word in idx2word.items()}  # word -> index (encoder)

In [None]:
# create encoder and decoder functions
def encoder(words,encode_dict):
  '''
  Convert a sequence of words into a vector of token indices.

  words       : sequence of words to encode
  encode_dict : dictionary that maps each word to its integer token index

  Returns a numpy integer array with one token index per word.
  Raises a KeyError if a word is not a key in encode_dict.
  '''

  # look up the token of each word in the vocab
  return np.array([ encode_dict[word] for word in words ],dtype=int)

# and the decoder function
def decoder(idxs,decode_dict):
  '''
  Convert a sequence of token indices back into text.

  idxs        : sequence of token indices
  decode_dict : dictionary that maps each token index to its word

  Returns one string with the decoded words separated by single spaces.
  Raises a KeyError if an index is not a key in decode_dict.
  '''

  # look up the word for each token index
  decodedWords = []
  for idx in idxs:
    decodedWords.append(decode_dict[idx])

  # glue the words together into one string
  return ' '.join(decodedWords)

# Exercise 2: A random walk through the Time Machine

In [None]:
# random tokens: 10 indices drawn from anywhere in the vocab
# (the upper bound of randint is exclusive, so the largest possible index is len(vocab)-1)
randomTokens = np.random.randint(low=0,high=len(vocab),size=10)

# show the random token indices...
print(f'Random tokens: \n\t{randomTokens}\n')

# ...and the (nonsense) text they decode to
print(f'Decoded text: \n\t{decoder(randomTokens,idx2word)}')

In [None]:
# A brief aside on Brownian noise:
# a random walk created by cumulatively summing 3000 random steps of -1 or +1
brownNoise = np.random.choice([-1,1],3000).cumsum()

# plot the walk as a black line in a wide, short figure
_,ax = plt.subplots(figsize=(10,3))
ax.plot(brownNoise,'k')
ax.set(xlim=[0,len(brownNoise)],xlabel='"Time" (?)',ylabel='Signal amplitude',title='Brownian noise')
plt.show()

In [None]:
# Brownian noise: a short random walk (30 steps of -1 or +1)
brownNoise = np.cumsum(np.random.choice([-1,1],30))
print(brownNoise)

# shift the walk to a random location in the vocab
# note: the offset is drawn from a limited range so that every token index stays
#       inside [0,nLex-1]; an unconstrained offset can push the walk below 0 or
#       past the end of the vocab, which makes the decoder fail with a KeyError.
#       (this assumes the vocab is larger than the range of the walk)
minOffset = -brownNoise.min()
maxOffset = nLex - 1 - brownNoise.max()
BrownianRandomTokens = brownNoise + np.random.randint(minOffset,maxOffset+1)
print(BrownianRandomTokens)
print('')

# test with random token indices
print(f'Brownian random tokens: \n\t{BrownianRandomTokens}\n')
print(f'Decoded text: \n\t{decoder(BrownianRandomTokens,idx2word)}')

# Exercise 3: Distribution of word lengths

In [None]:
# count the characters per word
numChars = np.array([ len(w) for w in words ],dtype=int)

# now count the number of words with those characters
# note: np.bincount creates one bin per value from 0 up to AND INCLUDING the maximum,
#       so charCounts[k] is the number of words with exactly k characters
#       (sizing the vector as max(numChars) would leave out the longest words)
charCounts = np.bincount(numChars)


# and plot
_,axs = plt.subplots(2,1,figsize=(10,7))

# top panel: character count of each word, in order of appearance in the text
axs[0].scatter(range(nWords),numChars,marker='.',s=10,c=np.linspace(.1,.9,len(numChars)),alpha=.4)
axs[0].set(yticks=range(1,len(charCounts)),xlabel='Token index',xlim=[-15,nWords+15],
           ylabel='Number of characters',title='Character count by token index')

# bottom panel: how many words have each character count
axs[1].bar(range(len(charCounts)),charCounts,edgecolor='k',color=[.9,.7,.9])
axs[1].set(xticks=range(1,len(charCounts)),xlim=[0,len(charCounts)],xlabel='Number of characters',
           ylabel='Token count',title='Histogram of character count frequencies')

plt.tight_layout()
plt.show()

# Exercise 4: Encode a novel sentence

In [None]:
# the text to encode
sentence = 'The space aliens came to Earth to steal watermelons and staplers.'

# preprocess (remove punctuation, make lower-case, split into words)
# note: a raw string (not an f-string; there is nothing to interpolate) so that
#       \s reaches the regex engine intact instead of being an invalid string escape
words_new = re.split(r'[,.\s]+',sentence.lower())

# remove empty items (the final period leaves an empty string at the end of the list)
words_new = [item for item in words_new if item]
words_new

In [None]:
# tokenize (uh oh...)
# the original encoder raises a KeyError for any word that is not in the vocab;
# the error is caught and displayed so the notebook still runs top-to-bottom
try:
  print(encoder(words_new,word2idx))
except KeyError as err:
  print(f'KeyError: the word {err} is not in the vocab')

# Exercise 5: Create a new encoder

In [None]:
# need to update the vocab (copies, so the original dictionaries are left untouched)
word2idx_new = word2idx.copy()
idx2word_new = idx2word.copy()

# add an entry for unknown words
# note: the existing token indices are 0..len(word2idx)-1, so the next free index is
#       len(word2idx); using len+1 would leave an unused gap in the token indices.
#       The same index is used in both dictionaries so encoder and decoder agree.
unkIdx = len(word2idx)
word2idx_new['<|unk|>'] = unkIdx
idx2word_new[unkIdx] = '<|unk|>'

In [None]:
# need a new encoder function
def encoder_new(words,encode_dict):
  '''
  Convert a sequence of words into a vector of token indices, using the
  '<|unk|>' token for any word that is not in the vocab.

  words       : sequence of words to encode
  encode_dict : dictionary that maps each word to its integer token index;
                it must contain the key '<|unk|>' if any word might be unknown

  Returns a numpy integer array with one token index per word.
  '''

  # known words get their own token; all other words get the unknown-word token
  # (the '<|unk|>' entry is looked up only when an unknown word is actually found)
  tokens = [ encode_dict[word] if word in encode_dict else encode_dict['<|unk|>'] for word in words ]

  # return the results as a vector of integers
  return np.array(tokens,dtype=int)

  # note: could also use an explicit for-loop that fills in a pre-allocated vector

In [None]:
# try again, now with the new encoder and the vocab that includes the unknown-word token
tokenidx = encoder_new(words=words_new,encode_dict=word2idx_new)
tokenidx

In [None]:
# need a new decoder function?
# (decoder only looks up each index in the dictionary it is given,
#  so here it is simply called with the updated dictionary)
decoder(idxs=tokenidx,decode_dict=idx2word_new)