|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Math with tokens and embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# high-resolution plots: have the inline plotting backend render figures as SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Import GPT-2 model and extract its embeddings matrix

In [None]:
from transformers import GPT2Model,GPT2Tokenizer

# load the tokenizer and the pretrained model (both from the 'gpt2' checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2Model.from_pretrained('gpt2')

# token-embeddings weights as a numpy array (rows are indexed by token);
# detach() is called first so the tensor can be converted to numpy
tokenEmbedWeights = gpt2.wte.weight
embeddings = tokenEmbedWeights.detach().numpy()

# Exercise 1: Numbers to tokens

In [None]:
# create some numbers: 0-10, then 20-100 in steps of 10, then 200-1000 in steps of 100
baseNumbers = np.arange(11)
numbers = np.concatenate( (baseNumbers,10*baseNumbers[2:],100*baseNumbers[2:]), axis=0)

# initialize token vector (integer dtype, because token indices are integers)
numTokenLabels = np.zeros(len(numbers),dtype=int)

# get and report the tokens
for i,n in enumerate(numbers):

  # tokenize the number (as a string) once, and reuse the result below
  toks = tokenizer.encode(str(n))

  # store only the first token for this number (a number may be split into several tokens)
  numTokenLabels[i] = toks[0]

  # try /2
  print(f'The number {n:5} is token(s) {toks}')

In [None]:
# first token of each number: thin gray connecting line with color-coded squares on top
_,ax = plt.subplots(1,1,figsize=(10,4))
ax.plot(numbers,numTokenLabels,color=[.5,.5,.5],linewidth=.5)
ax.scatter(numbers,numTokenLabels,c=np.arange(len(numbers)),s=100,marker='s',cmap='plasma_r',zorder=10)

# one x-tick per number, with a little padding on both sides of the x-axis
ax.set(xlabel='Number (as string)',ylabel='Token value',xticks=numbers,xlim=[numbers[0]-15,numbers[-1]+15])
# ax.set(xscale='log')
plt.show()

# Exercise 2: How long are numbers?

In [None]:
# tokenize integers and floating-point numbers

# initialize (one token count per number)
numnums = 99_999
int_toklens = np.zeros(numnums,dtype=int)
float_toklens = np.zeros(numnums,dtype=int)

# random numbers (normally distributed, scaled by 5), drawn from a seeded
# generator so that the results are identical on every run of the notebook
rng = np.random.default_rng(seed=0)
ra = 5*rng.standard_normal(numnums)

for i in range(numnums):

  # integers: the loop index itself is the number being tokenized
  int_toklens[i] = len(tokenizer.encode(str(i)))

  # and the random numbers (tokenized from their string representation)
  float_toklens[i] = len(tokenizer.encode(str(ra[i])))

In [None]:
# two panels: integers on the left, floating-point numbers on the right
_,axs = plt.subplots(1,2,figsize=(12,4))
axInt,axFloat = axs

# small random vertical jitter so that markers with the same token length don't fully overlap
intJitter = np.random.randn(numnums)/30
floatJitter = np.random.randn(numnums)/50

# integers (x-axis is the position in the vector, which is also the integer itself)
axInt.plot(int_toklens+intJitter,'s',markerfacecolor=[.7,.7,.9],alpha=.4)
axInt.set(xlabel='Number',ylabel='Token length',yticks=range(int_toklens.max()+2),title='Token lengths of integers')

# floating-point numbers (x-axis is the random number)
axFloat.plot(ra,float_toklens+floatJitter,'o',markerfacecolor=[.7,.9,.7],alpha=.4)
axFloat.set(xlabel='Number',ylabel='Token length',title='Token lengths of floating-point numbers')

plt.tight_layout()
plt.show()

# Exercise 3: Does math work in token conversions?

In [None]:
# the equation and its tokens
eq = '5 x 3 ='
tokens = tokenizer.encode(eq)

# try the math: multiply the token indices together
# (explicit 64-bit accumulator, so the product cannot wrap around on
#  platforms where numpy's default integer is 32-bit)
tokenProd = int(np.prod(tokens,dtype=np.int64))

print(f'{eq} -> {tokens}')
print(f'Product of tokens = {tokenProd}')

# The product can only be decoded if it is itself a valid token index, i.e., a
# row of the embeddings matrix. Decoding an out-of-range index is not meaningful
# (and may raise an error, depending on the tokenizer version), so report it instead.
numTokensInVocab = embeddings.shape[0]
if 0 <= tokenProd < numTokensInVocab:
  print(f'   which is "{tokenizer.decode(tokenProd)}"')
else:
  print(f'   which is not a valid token index (the vocab has only {numTokensInVocab} tokens)')

In [None]:
# maybe just the numbers?
t5 = tokenizer.encode('5')
t3 = tokenizer.encode('3')

# let's see...
print(f'"5" and "3" have tokens {t5} and {t3}.')
print(f'Their product is {t5[0]*t3[0]}, which is "{tokenizer.decode(t5[0]*t3[0])}"')

# Exercise 4: How about embeddings?

In [None]:
# isolate the embedding vectors
e5 = embeddings[t5,:].squeeze()
e3 = embeddings[t3,:].squeeze()

# math
theirSum  = e3+e5
theirProd = e3*e5

# plot the vectors
plt.figure(figsize=(12,4))

plt.plot(e3,label='3')
plt.plot(e5,label='5')
plt.plot(theirSum,label='3+5')
plt.plot(theirProd,label='3x5')

plt.gca().set(xlabel='Embeddings dimension',ylabel='Value',xlim=[0,len(e3)])
plt.legend()
plt.show()

In [None]:
# unembedding as transpose of embeddings:
# each output element is the dot product between the vector and one row of the embeddings matrix
unembedMatrix = embeddings.T
sumUnembedding  = theirSum @ unembedMatrix
prodUnembedding = theirProd @ unembedMatrix

# print sizes (the shape of the matrix that is actually multiplied, i.e., the transpose,
# so that the inner dimensions in the printed product match)
print('Summed vector X embeddings matrix = unembeddings vector')
print(f'    {theirSum.shape}    X    {unembedMatrix.shape}   =  {sumUnembedding.shape}\n')

# find the argmax output: the token whose embedding has the largest dot product with each vector
print(f'Max embedding of 5+3 = "{tokenizer.decode(np.argmax(sumUnembedding))}"')
print(f'Max embedding of 5x3 = "{tokenizer.decode(np.argmax(prodUnembedding))}"')

In [None]:
# curious to see how the embeddings for '3' and '5' compare:
# one dot per embeddings dimension, with the value for "5" on the x-axis and for "3" on the y-axis
plt.plot(e5,e3,'.')
plt.gca().set(xlabel='Embedding values for "5"',ylabel='Embedding values for "3"',
              title='Embeddings of "5" vs. "3"')
plt.show()