|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>Masked word prediction in BERT</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

# render inline matplotlib figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
from transformers import BertTokenizer, BertForMaskedLM

# one checkpoint name shared by the tokenizer and the model, so the two always match
checkpoint = 'bert-base-uncased'

# pre-trained tokenizer and the BERT masked-language-modeling model
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint)

# switch to evaluation mode (as the last expression, the model is also shown as the cell output)
model.eval()

# An unmasked sentence in BERT

In [None]:
# text is paraphrased from https://en.wikipedia.org/wiki/Cubism
text = 'Cubism is an art movement that sparked innovations in music and architecture'

# encode the sentence as a PyTorch tensor of token ids; the sentence is row 0
tokens = tokenizer.encode(text, return_tensors='pt')

# list each token id next to the text it decodes to
for token_id in tokens[0]:
  decoded = tokenizer.decode(token_id)
  print(f'Token {token_id:5} is "{decoded}"')

In [None]:
# position in the sequence whose prediction is inspected
token_pos = 5

# forward pass (no gradient tracking)
with torch.no_grad():
  outputs = model(tokens)

# vocabulary logits at that position, and the index of the largest one
logits5 = outputs.logits[0,token_pos,:]
maxlogit5 = logits5.argmax()

# visualize (read the logits at the token's own position, not the preceding one!)
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(maxlogit5,logits5[maxlogit5],'go',markersize=10)
ax.plot(logits5,'k.',alpha=.3)

# title compares the model's top prediction with the token actually in the text
predicted_word = tokenizer.decode(maxlogit5)
actual_word = tokenizer.decode(tokens[0,token_pos])
ax.set(title=f'Model prediction is "{predicted_word}" (text is "{actual_word}")',
       xlabel='Token index',ylabel='Model output logit',xlim=[-10,tokenizer.vocab_size+9])

plt.show()

# Using a mask

In [None]:
masked_text = 'Cubism is an [MASK] movement that sparked innovations in music and architecture'
masked_tokens = tokenizer.encode(masked_text, return_tensors='pt')

# show every token id with its decoded text
for token_id in masked_tokens[0]:
  decoded = tokenizer.decode(token_id)
  print(f'Token {token_id:5} is "{decoded}"')

# index of the [MASK] token: torch.where gives (row, column) indices; keep the columns
is_mask = masked_tokens == tokenizer.mask_token_id
mask_token_idx = torch.where(is_mask)[1]
print(f'\nMask token is {tokenizer.mask_token_id} and is in index {mask_token_idx.item()}')

In [None]:
# forward pass (no gradient tracking)
with torch.no_grad():
  outputs = model(masked_tokens)

# vocabulary logits at the masked position, and the index of the largest one
# (mask_token_idx is a tensor index, so squeeze removes the leftover length-1 axis)
logitsMask = outputs.logits[0,mask_token_idx,:].squeeze()
maxlogitMask = logitsMask.argmax()

# visualize (read the logits at the token's own position, not the preceding one!)
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(maxlogitMask,logitsMask[maxlogitMask],'go',markersize=10)
ax.plot(logitsMask,'k.',alpha=.3)

# the "text is" word is read from the unmasked `tokens` at the same position
predicted_word = tokenizer.decode(maxlogitMask)
actual_word = tokenizer.decode(tokens[0,mask_token_idx])
ax.set(title=f'Model prediction is "{predicted_word}" (text is "{actual_word}")',
       xlabel='Token index',ylabel='Model output logit',xlim=[-10,tokenizer.vocab_size+9])

plt.show()

In [None]:
# print the top-10 predictions

# friendly reminder of the masked text
print(masked_text,'\n')

# the ten largest logits at the masked position, with their token ids
top10 = torch.topk(logitsMask,10)

# one line per candidate token
for token_id,logit in zip(top10.indices,top10.values):
  print(f'Logit score of {logit:.2f} for token "{tokenizer.decode(token_id)}"')

# Loop over all tokens to get predictions

In [None]:
# initialize: one predicted token id per position in the sequence
predicted_tokens = np.zeros(len(tokens[0]),dtype=int)

# loop over positions, replace that token with [MASK], and get logits
for idx in range(len(tokens[0])):

  # work on a copy under its own name, so that neither `tokens` nor the
  # earlier single-mask example stored in `masked_tokens` is overwritten
  tokens_one_masked = tokens.clone()
  tokens_one_masked[0,idx] = tokenizer.mask_token_id

  # confirmation:
  print(tokens_one_masked[0].tolist())

  # forward pass through the model
  with torch.no_grad():
    outputs = model(tokens_one_masked)

  # get logits for the masked position
  mask_logits = outputs.logits[0,idx,:]

  # token id with the largest logit at the masked position
  predicted_tokens[idx] = torch.argmax(mask_logits,dim=-1).item()

# Check the final results!

In [None]:
# decode every predicted token except those at the first and last positions
predicted_words = [tokenizer.decode(token_id) for token_id in predicted_tokens[1:-1]]

print('Original text:\n',text,'\n')
print('Predicted text:\n',' '.join(predicted_words))

In [None]:
print('     ORIGINAL |  PREDICTED | EMBEDS CS')
print('-'*39)

# token-embedding matrix, looked up once instead of on every iteration
embeddings = model.bert.embeddings.word_embeddings.weight.detach()

# skip only the first and last positions (the tokenizer's special tokens),
# so that every token in between -- including the final word -- gets a row
for tidx in range(1,len(tokens[0])-1):

  target = tokens[0,tidx]
  prediction = predicted_tokens[tidx]

  # cosine similarity between the embeddings of the predicted and original tokens
  tokenE = embeddings[target,:]
  predE = embeddings[prediction,:]

  cs = torch.cosine_similarity(tokenE.unsqueeze(0),predE.unsqueeze(0))

  print(f' {tokenizer.decode(target):>12} | {tokenizer.decode(prediction):^10} |  {cs.item():.3f}')