|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>Multiple videos on learning embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# typical libraries...
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# for importing and working with texts
import requests
import re
import string

# pytorch stuff
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

!pip install torchinfo # not installed by default in colab
from torchinfo import summary

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# **Code below is for video "Create a data loader to train a model"**

## Import text and create dictionary

In [None]:
# get raw text from internet (The Time Machine... yeah I use it a lot :P  )
# (timeout + status check so a failed download stops here instead of
#  silently tokenizing an error page)
response = requests.get('https://www.gutenberg.org/files/35/35-0.txt',timeout=30)
response.raise_for_status()
text = response.text

# character strings to replace with space
# (the 'â\x80..' entries are the UTF-8 bytes of curly quotes/dashes as they look
#  when the response is decoded as Latin-1)
strings2replace = [ '\r\n\r\nâ\x80\x9c','â\x80\x9c','â\x80\x9d','\r\n','â\x80\x94','â\x80\x99','â\x80\x98','_', ]

# these are literal strings (not patterns), so plain str.replace is sufficient
for str2match in strings2replace:
  text = text.replace(str2match,' ')

# remove non-ASCII characters and numbers, and make lower-case
text = re.sub(r'[^\x00-\x7F]+', ' ', text)
text = re.sub(r'\d+','',text).lower()

# split on runs of punctuation/whitespace
# (re.escape keeps every punctuation character literal inside the class; without it,
#  the backslash in string.punctuation escapes the following ']' and drops out of the set)
tokens = re.split(rf'[{re.escape(string.punctuation)}\s]+',text)

# keep words with >1 letter
words = [w for w in (item.strip() for item in tokens) if len(w)>1]

# create the vocabulary (lexicon)
vocab  = sorted(set(words))
nWords = len(words)
nVocab = len(vocab)

# encoder/decoder look-up-tables (as python dictionaries)
word2idx = {w:i for i,w in enumerate(vocab)}
idx2word = {i:w for i,w in enumerate(vocab)}

# show a few keys in the dictionary (its keys are in the same order as vocab)
print(f'The book contains {nWords:,} words, {nVocab:,} of which are unique and comprise the vocab.')
print('\n\nFirst 10 vocab words:\n',vocab[:10])

In [None]:
# dataset parameters
context_length = 8 # number of words per sequence
stride = 2 # step between the starts of consecutive sequences

# start index of every (overlapping) window of context_length words
window_starts = range(0,nWords-context_length,stride)

# each window, and the same window shifted one word ahead, encoded as vocab indices
inputs  = [ [word2idx[w] for w in words[s  :s+context_length  ]] for s in window_starts ]
targets = [ [word2idx[w] for w in words[s+1:s+context_length+1]] for s in window_starts ]

print(inputs[123])
print(targets[123])

In [None]:
# a closer look at two neighboring samples
# (each target list is its input list shifted one word ahead)
print('Inputs: ',inputs[4])
print('Targets:',targets[4])
print('')
print('Inputs :',inputs[5])
print('Targets:',targets[5])
# this is what we need, although we need it in torch Dataset/DataLoader format

In [None]:
# we need each list to be a tensor
# (last expression in the cell, so the notebook displays the result)
torch.tensor(inputs[4])

## Create a class for a dataset object

In [None]:
# create a class for a dataset
class WordDataset(Dataset):
  """Sliding-window dataset of token-index sequences.

  Every sample is a pair of index tensors: an input window of `context_length`
  tokens, and a target window covering the same span shifted one token ahead.

  Args:
    text: sequence of tokens (e.g., a list of words).
    word2idx: mapping from each token to its integer index.
    context_length: number of tokens in each window.
    stride: offset between the starts of consecutive windows.
  """

  def __init__(self, text, word2idx, context_length=8, stride=4):

    self.word2idx = word2idx  # kept on the object

    def encode(tokens):
      # tokens -> tensor of vocab indices
      return torch.tensor([word2idx[tok] for tok in tokens])

    # window starts; the upper bound leaves room for the shifted target window
    starts = range(0,len(text)-context_length,stride)

    self.inputs  = [ encode(text[s  :s+context_length  ]) for s in starts ]
    self.targets = [ encode(text[s+1:s+context_length+1]) for s in starts ]

  def __len__(self):
    # number of (input, target) pairs
    return len(self.inputs)

  def __getitem__(self, idx):
    # the input window and its shifted target window
    return self.inputs[idx], self.targets[idx]


# build the dataset that the rest of the notebook uses
context_length = 6 # tokens per sample
stride = 3 # step (in tokens) between consecutive samples
text_dataset = WordDataset(words,word2idx,context_length=context_length,stride=stride)

# peek at one (input, target) pair
text_dataset[4]

## And a dataloader for training

In [None]:
# wrap the dataset in a dataloader
# (batch_size: 2 for looking; 32 for training)
dataloader = DataLoader(text_dataset, batch_size=32, shuffle=True, drop_last=True)

# pull one batch and show the token indices
X,y = next(iter(dataloader))
print('Inputs:')
print(X)
print('')

print('Targets:')
print(y)
print('\n\n\n')

# decode the first sequence of the batch back into words
first_input_words  = [idx2word[tok.item()] for tok in X[0]]
first_target_words = [idx2word[tok.item()] for tok in y[0]]

print('Inputs in words (first batch):')
print(first_input_words)
print('')

print('Targets in words (first batch):')
print(first_target_words)

# **Code below is for video "Build a model to learn the embeddings"**

In [None]:
# how does embedding dimensionality relate to vocab size?

# log-spaced vocab sizes
N = np.logspace(np.log10(1000),np.log10(100000),23)

# heuristic for non-LLM models like word2vec or glove:
embdim = np.sqrt(N)

# GPT2: [ vocab size , embedding dimensionality ]
gpt2dims = [ 50257,768 ]
gpt2vocab,gpt2emb = gpt2dims
gpt2expected = np.sqrt(gpt2vocab) # what the heuristic gives for GPT2's vocab size

plt.figure(figsize=(8,4))

# the heuristic curve
plt.plot(N,embdim,'ks-',markersize=8,markerfacecolor=[.9,.7,.7],label=r'$s = \sqrt{N}$')

# crosshair at the heuristic's value for GPT2 (vertical then horizontal segment)
plt.plot([gpt2vocab,gpt2vocab],[0,gpt2expected],'k--',linewidth=1,label='Expected GPT2')
plt.plot([0,gpt2vocab],[gpt2expected,gpt2expected],'k--',linewidth=1)

# crosshair at GPT2's actual embedding dimensionality
plt.plot([gpt2vocab,gpt2vocab],[0,gpt2emb],'b:',linewidth=1,label='Actual GPT2')
plt.plot([0,gpt2vocab],[gpt2emb,gpt2emb],'b:',linewidth=1)

ax = plt.gca()
ax.set(xlabel='Vocab size',ylabel='Embeddings dimensions',
       xlim=[-100,N[-1]+2000],ylim=[0,None])
plt.legend()
plt.show()

## Create and explore an embedding layer

In [None]:
# dimensionality of embedding space (arbitrarily set to 100)
embeddingDimension = 100

# create a random (untrained) embedding layer for nVocab tokens
embedding_layer = nn.Embedding(nVocab,embeddingDimension)

# let's see its size (last expression in the cell, so the notebook displays it)
embedding_layer.weight.shape

In [None]:
# what does it look like?

# weights without gradient tracking, and a word picked at random
weights = embedding_layer.weight.detach()
aRandomWord = np.random.choice(vocab)
randIdx = word2idx[aRandomWord]

_,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: the whole matrix, with the chosen token marked by a dashed line
axs[0].imshow(weights.T,aspect='auto',vmin=-1,vmax=1)
axs[0].axvline(randIdx,color='w',linestyle='--')
axs[0].set(ylabel='Embedding dimension',xlabel='Token index',title='Entire embedding matrix')

# right panel: the embedding vector of that token
axs[1].plot(weights[randIdx,:],'ks',markerfacecolor=[.7,.9,.7])
axs[1].set(xlabel='Embedding dimension',ylabel='Weight value',title=f'Embedding for "{aRandomWord}" (idx = {randIdx})')

plt.tight_layout()
plt.show()

In [None]:
# two closely related words
word1 = 'time'
word2 = 'machine'

# look up their embedding vectors
weights = embedding_layer.weight.detach()
embed1 = weights[word2idx[word1],:]
embed2 = weights[word2idx[word2],:]

# cosine similarity: dot product divided by the product of the vector norms
numer  = torch.dot(embed1,embed2)
denom  = torch.norm(embed1)*torch.norm(embed2)
cosSim = numer/denom

# visualize one vector against the other
plt.plot(embed1,embed2,'ks',markerfacecolor=[.7,.9,.7],alpha=.6)
ax = plt.gca()
ax.set(xlabel=f'Embedding for "{word1}"',ylabel=f'Embedding for "{word2}"',
       title=f'Cosine similarity: {cosSim:.3f}')
plt.show()

## Build the model

In [None]:
class EmbeddingModel(nn.Module):
  """Embedding layer followed by a two-layer fully connected classifier.

  The embeddings of all tokens in the context window are concatenated, passed
  through one hidden ReLU layer, and mapped to log-probabilities over the vocab.

  Args:
    vocab_size: number of tokens in the vocabulary (also the output size).
    embedding_dim: dimensionality of each token embedding.
    context_size: number of tokens in each input sequence.
    hidden_dim: number of units in the hidden layer (default 128).
  """

  def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
    super().__init__()

    # embedding layer
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)

    # linear layers
    self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
    self.linear2 = nn.Linear(hidden_dim, vocab_size)

  def forward(self, inputs):
    """Return log-probabilities of shape [batch_size, vocab_size].

    Args:
      inputs: integer token indices of shape [batch_size, context_size].
    """

    # extract and flatten embeddings [batch_size, context_size * embedding_dim]
    embeds = self.embeddings(inputs).flatten(start_dim=1)

    # fully connected layers
    out = F.relu(self.linear1(embeds))
    out = self.linear2(out)

    # log softmax for classification (note: NLLLoss expects logprobs as inputs)
    return F.log_softmax(out, dim=1)


# create a model instance!
model = EmbeddingModel(vocab_size=nVocab, embedding_dim=embeddingDimension, context_size=context_length)
print(model)

# Xavier-initialize every parameter with more than one dimension
# (i.e., the weight matrices; the 1D biases are left alone)
weight_matrices = [p for p in model.parameters() if p.dim()>1]
for weights in weight_matrices:
  nn.init.xavier_normal_(weights)

In [None]:
# push one batch through the (untrained) model
X,y = next(iter(dataloader))
modelOut = model(X)

print('Input to model:')
print(X)
print('')

# size of the output, as a plain list
outSize = list(modelOut.shape)
print(f'Output from model (size: {outSize}):')
print(modelOut)

In [None]:
# model output for the first sequence in the batch (log soft-max)
firstRow = modelOut.detach()[0]
print(firstRow)
print('')

# shouldn't the sum be 1?
print(f'Log softmax sum = {firstRow.sum():.3f}')

# ah, it's *log* softmax :D  (exponentiate first, then sum)
print(f'exp(log(softmax)) sum = {torch.exp(firstRow).sum():.3f}')

In [None]:
# decode the first input sequence, and find the token with the largest output value
inputWords = [idx2word[tok.item()] for tok in X[0]]
predictedIdx = modelOut[0].argmax().item()

print('Model input:')
print(inputWords)
print('')

print('Model output:')
print(idx2word[predictedIdx])

In [None]:
# model output (log-probabilities) for the first sequence in the batch;
# the trailing ';' suppresses the notebook's text echo of the plot object
plt.plot(modelOut[0].detach(),'o');

## Have the model generate text

In [None]:
# grab some data from the loader
X,y = next(iter(dataloader))

print('First input:')
print(' '.join([idx2word[w.item()] for w in X[0]]))
print('\nSubsequent inputs:')

# text generation
# (inference only: no_grad avoids building an autograd graph on every step,
#  which matters here because X is modified in place between forward passes)
with torch.no_grad():
  for _ in range(context_length):

    # get output for this input
    Y = model(X)

    # pick the most likely next word (for the first sequence in the batch)
    nextWord = Y[0].argmax().item()

    # create new input for the next iteration: drop the oldest token, append the new one
    X[0] = torch.cat((X[0][1:],torch.tensor([nextWord])))

    # print out the generated text so far
    print(' '.join([idx2word[w.item()] for w in X[0]]))

## How big is our model?

In [None]:
# summary of model and parameters
# (X must already exist from an earlier cell; it is passed as the example input)
summary(model, input_data=X, col_names=['input_size','output_size','num_params'])

# **Code below is for video "Train and evaluate the model"**

In [None]:
# we'll use the GPU for speed (falls back to the CPU when CUDA isn't available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [None]:
# a fresh model instance
model = EmbeddingModel(vocab_size=nVocab, embedding_dim=embeddingDimension, context_size=context_length)

# Xavier-initialize the parameters that have more than one dimension
for weights in (p for p in model.parameters() if p.dim()>1):
  nn.init.xavier_normal_(weights)

# and move the model to the chosen device
model = model.to(device)

In [None]:
# create the loss and optimizer functions
# (NLLLoss takes log-probabilities, which is what the model outputs)
loss_function = nn.NLLLoss().to(device)
# the optimizer is tied to the parameters of the model that exists at this point
optimizer = torch.optim.AdamW(model.parameters(), lr=.001, weight_decay=.01)

In [None]:
# quick test for errors and sanity-check the output matrix sizes

# one batch, moved to the same device as the model
X,y = next(iter(dataloader))
X = X.to(device)
y = y.to(device)

# forward pass
modelOutput = model(X)

# check the sizes
print(f'Model input is of size: {X.shape}')
print(f'Target output is of size: {y.shape}')
print(f'Model output is of size: {modelOutput.shape}')

# loss against the final word of each target sequence
finalTargets = y[:,-1]
loss = loss_function(modelOutput,finalTargets)
print('\nLoss:')
loss

In [None]:
# extract the pretrained embedding weights for comparison later
# (.clone() is required: when the model is on the CPU, .detach().cpu() returns a tensor
#  that shares memory with the model's weights, so training would overwrite this snapshot)
pretrained_embeddings = model.embeddings.weight.detach().cpu().clone()

In [None]:
# reminder: use batchsize=32 in data loader ;)

# Now train the model!

In [None]:
def trainTheModel(model,num_epochs=25):
  """Train `model` and return it together with its per-epoch losses.

  Relies on notebook-level variables rather than arguments: `dataloader`, `device`,
  `loss_function`, and `optimizer`. The optimizer must have been created from this
  model's parameters; otherwise `optimizer.step()` updates some other model.

  Args:
    model: network that maps a batch of token indices to log-probabilities.
    num_epochs: number of passes through the dataloader.

  Returns:
    (model, total_loss), where total_loss is a numpy array holding the average
    batch loss of each epoch.
  """

  # initialize losses
  total_loss = np.zeros(num_epochs)

  # number of batches per epoch (guarded so an empty dataloader doesn't divide by zero)
  num_batches = max(len(dataloader),1)

  for epoch in range(num_epochs):

    # initialize
    epoch_loss = 0

    # loop over batches in the data loader
    for X,y in dataloader:

      # move data to GPU
      X,y = X.to(device), y.to(device)

      # clear previous gradients
      model.zero_grad()

      # forward pass
      log_probs = model(X)

      # calculate the losses from the final target word
      loss = loss_function(log_probs,y[:,-1])

      # backprop
      loss.backward()
      optimizer.step()

      # sum the per-batch losses
      epoch_loss += loss.item()

    # average over batches. Assuming loss_function uses mean reduction (the default for
    # nn.NLLLoss), each batch loss is already a per-sample average; dividing by the number
    # of samples in the dataset would shrink the reported loss by roughly the batch size.
    total_loss[epoch] = epoch_loss / num_batches

    # update our progress :)
    print(f'  Finished epoch {epoch+1} with loss {total_loss[epoch]:.4f}')

  # output the model and the losses
  return model,total_loss

In [None]:
# train the model! (default number of epochs: 25)
model,total_loss = trainTheModel(model)

# loss per epoch
plt.figure(figsize=(10,3))
ax = plt.gca()
ax.plot(total_loss,'ks-',markerfacecolor='w',markersize=8)
ax.set(xlabel='Epoch',ylabel='Loss')
plt.show()

In [None]:
# get the trained weights
# (.clone() makes this a true snapshot: on the CPU, .detach().cpu() alone would share
#  memory with the model, so any further training would silently change these values)
postrained_embeddings = model.embeddings.weight.detach().cpu().clone()

In [None]:
# Plenty of empty space so you won't look by accident :P

# **Code below is for video "CodeChallenge: How the embeddings change"**

## Exercise 1: distributions of embeddings

In [None]:
# histogram counts and bin edges via numpy
yPre,edgesPre = np.histogram(pretrained_embeddings.flatten(),bins=50)
yPst,edgesPst = np.histogram(postrained_embeddings.flatten(),bins=50)

# x values are the bin centers (midpoints of neighboring edges)
xPre = (edgesPre[1:]+edgesPre[:-1]) / 2
xPst = (edgesPst[1:]+edgesPst[:-1]) / 2

# bars for each distribution (post-train is drawn first, pre-train on top of it)
plt.figure(figsize=(12,5))
barSpecs = (
  (xPst,yPst,[.7,.9,.7,.7],[.3,.6,0,.4],'POSTtrain'),
  (xPre,yPre,[.9,.7,.7,.3],[.6,0,.3,.4],'PREtrain'),
)
for centers,counts,faceColor,edgeColor,name in barSpecs:
  plt.bar(centers,counts,width=centers[1]-centers[0],color=faceColor,edgecolor=edgeColor,label=name)

plt.gca().set(xlabel='Weight value',ylabel='Count',title='Distributions of embedding weights')
plt.legend()
plt.show()

## Exercise 2: Embeddings vectors for a random word

In [None]:
# a random word and its index
aRandomWord = np.random.choice(vocab)
randWord_idx = word2idx[aRandomWord]

# its embedding vector before and after training
pre_ev = pretrained_embeddings[randWord_idx,:]
pst_ev = postrained_embeddings[randWord_idx,:]

_,axs = plt.subplots(2,2,figsize=(10,7))

# top row: the full embedding matrices, with the random word marked by a dashed line
matrices = ( ('PREtrain',pretrained_embeddings), ('POSTtrain',postrained_embeddings) )
for panel,(name,mat) in zip(axs[0],matrices):
  panel.imshow(mat.T,aspect='auto',vmin=-.2,vmax=.2)
  panel.axvline(randWord_idx,linestyle='--',color=[.8,.8,.8])
  panel.set(ylabel='Embedding dimension',xlabel='Token index',title=f'{name} embedding matrix')

# bottom left: both versions of the word's embedding
axs[1,0].plot(pre_ev,'ks-',markerfacecolor=[.7,.9,.7],label='PREtrain')
axs[1,0].plot(pst_ev,'ko-',markerfacecolor=[.9,.7,.7],label='POSTtrain')
axs[1,0].set(xlabel='Embedding dimension',ylabel='Weight value',title=f'"{aRandomWord}" embedding (idx={randWord_idx})')
axs[1,0].legend(fontsize=8)

# bottom right: pre- vs. post-training values, on equal axis limits
axlim = max(abs(pre_ev).max(),abs(pst_ev).max()) * 1.1
limits = [-axlim,axlim]
axs[1,1].plot(pre_ev,pst_ev,'ks',markerfacecolor=[.9,.7,.9])
axs[1,1].set(xlim=limits,ylim=limits,xlabel='PREtrain embedding',
             ylabel='POSTtrain embedding',title='Change in embedding')

plt.tight_layout()
plt.show()

## Exercise 3: Time Machine embeddings

In [None]:
# two closely related words
word1 = 'time'
word2 = 'machine'
idx1,idx2 = word2idx[word1],word2idx[word2]

# their embeddings before (pre) and after (pst) training
embed1pre = pretrained_embeddings[idx1,:]
embed2pre = pretrained_embeddings[idx2,:]
embed1pst = postrained_embeddings[idx1,:]
embed2pst = postrained_embeddings[idx2,:]

# cosine similarity within each pair (vectors reshaped to one row each)
cosSim_pre = F.cosine_similarity(embed1pre.unsqueeze(0),embed2pre.unsqueeze(0))
cosSim_pst = F.cosine_similarity(embed1pst.unsqueeze(0),embed2pst.unsqueeze(0))


# visualize
_,axs = plt.subplots(1,2,figsize=(10,4))

# same symmetric limits on all axes, from the largest absolute weight
axlim = torch.stack((embed1pre,embed2pre,embed1pst,embed2pst)).abs().max() * 1.1
sharedSettings = dict(xlim=[-axlim,axlim],ylim=[-axlim,axlim],
                      xlabel=f'"{word1}" embedding',ylabel=f'"{word2}" embedding')

axs[0].plot(embed1pre,embed2pre,'ks',markerfacecolor=[.9,.7,.7],alpha=.6)
axs[0].set(title=f'Cosine similarity PREtrain: {cosSim_pre.item():.3f}',**sharedSettings)

axs[1].plot(embed1pst,embed2pst,'ko',markerfacecolor=[.7,.9,.7])
axs[1].set(title=f'Cosine similarity POSTtrain: {cosSim_pst.item():.3f}',**sharedSettings)

plt.tight_layout()
plt.show()

In [None]:
# cosine similarity manually: dot product over the product of the norms
num = torch.dot(embed1pst,embed2pst)
den = torch.norm(embed1pst)*torch.norm(embed2pst)
cs_man = num / den

# and via pytorch's built-in function
cs_pyt = F.cosine_similarity(embed2pst.unsqueeze(dim=0),embed1pst.view(1,-1))

# (the first value is computed by hand with torch operations, not with numpy)
print(f'Cosine similarity manually:   {cs_man:.4f}')
print(f'Cosine similarity from torch: {cs_pyt.item():.4f}')

# **Code below is for video "CodeChallenge: How stable are embeddings?"**

## Exercise 1: New training, different weights

In [None]:
# how many models to train, and for how many epochs each
numRepetitions = 10
numEpochs = 16

# storage: one row of losses and one embedding matrix per repetition
lossesMatrix = np.zeros((numRepetitions,numEpochs))
embeddingsMats = []

for repi in range(numRepetitions):

  # a brand-new model on the device, with Xavier-initialized weight matrices
  model = EmbeddingModel(vocab_size=nVocab, embedding_dim=embeddingDimension, context_size=context_length).to(device)
  for weights in (p for p in model.parameters() if p.dim()>1):
    nn.init.xavier_normal_(weights)

  # a new optimizer for the new model's parameters (trainTheModel reads this variable)
  optimizer = torch.optim.AdamW(model.parameters(), lr=.001, weight_decay=.01)

  # train
  print(f'** Running repetition {repi+1}/{numRepetitions} **')
  model,total_loss = trainTheModel(model,numEpochs)
  print('\n')

  # keep this repetition's losses and its trained embedding matrix
  lossesMatrix[repi,:] = total_loss
  embeddingsMats.append( model.embeddings.weight.detach().cpu() )

In [None]:
# losses of every repetition
plt.figure(figsize=(10,4))

# one line per repetition, each with its own color from the plasma colormap
for i,repLosses in enumerate(lossesMatrix):
  lineColor = mpl.cm.plasma(i/numRepetitions)
  plt.plot(repLosses,'s-',linewidth=2,color=lineColor,label=f'Rep. {i+1}')

plt.legend()
plt.gca().set(xticks=range(numEpochs),xlabel='Training epochs',ylabel='Losses')
plt.show()

## Exercise 2: Consistency of embedding vectors

In [None]:
_,axs = plt.subplots(3,2,figsize=(14,8))

# one random word per subplot
randWords = np.random.choice(vocab,size=np.prod(axs.shape))

for panel,word in zip(axs.flatten(),randWords):

  # index of this word
  randWord_idx = word2idx[word]

  # one line per repetition: this word's embedding in each trained model
  for embMat in embeddingsMats[:numRepetitions]:
    panel.plot(embMat[randWord_idx,:])

  panel.set_title(f'"{word}" embedding',fontsize=14)


# final adjustments: no ticks anywhere; axis labels only on the last subplot
for panel in axs.flatten():
  panel.set(xticks=[],yticks=[],xlim=[-1,embeddingDimension])
axs[-1,-1].set(xlabel='Embedding dimension',ylabel='Weight value')

plt.tight_layout()
plt.show()

## Exercise 3: Cosine similarity for selected word pairs

In [None]:
# three words, and the three pairs they form
word1 = 'time'
word2 = 'machine'
word3 = 'she'
wordPairs = [ (word1,word2), (word1,word3), (word2,word3) ]

# cosine similarities: one row per repetition, one column per pair
cossim = np.zeros((numRepetitions,3))

for repi in range(numRepetitions):
  embMat = embeddingsMats[repi]

  for pairi,(wordA,wordB) in enumerate(wordPairs):

    # embeddings of the two words in this repetition's model
    embA = embMat[word2idx[wordA],:]
    embB = embMat[word2idx[wordB],:]

    cossim[repi,pairi] = nn.functional.cosine_similarity(embA,embB.view(1,-1)).item()


# one marker style per pair
plt.figure(figsize=(10,4))
markerSpecs = [ ('ks',[.9,.7,.7]), ('ko',[.7,.8,.7]), ('k^',[.7,.7,.8]) ]
for pairi,((wordA,wordB),(marker,faceColor)) in enumerate(zip(wordPairs,markerSpecs)):
  plt.plot(cossim[:,pairi],marker,markerfacecolor=faceColor,markersize=12,label=f'"{wordA}" and "{wordB}"')

# reference line at zero, drawn behind the markers
plt.axhline(0,color=[.7,.7,.7],linestyle='--',zorder=-10)
plt.gca().set(xticks=range(numRepetitions),ylim=[-.4,.6],
              xlabel='Training repetition',ylabel='Cosine similarity',title='Cosine similarities')
plt.legend()

plt.show()