|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>Numerical issues in logits and softmax</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# vector plots
import matplotlib_inline.backend_inline
# request SVG output for inline figures
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Import GPT2-small and -large

In [None]:
# pretrained GPT-2 checkpoints: 'gpt2' -> gpt2_small, 'gpt2-large' -> gpt2_large
gpt2_small,gpt2_large = (
  AutoModelForCausalLM.from_pretrained(checkpoint) for checkpoint in ('gpt2','gpt2-large')
)

# tokenizer, loaded from the 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Logits from both models for the same inputs

In [None]:
# get the outputs of the models
# tokenize the prompt once so that both models receive exactly the same input
text = 'A plethora of platypuses.'
tokens = tokenizer.encode(text,return_tensors='pt')

# forward passes are used for inspection only, so disable gradient tracking
with torch.no_grad():
  outputs_small = gpt2_small(tokens)
  outputs_large = gpt2_large(tokens)

In [None]:
# logits at the last sequence position of the first batch item, detached from autograd
logits_small = outputs_small.logits[0,-1,:].detach()
logits_large = outputs_large.logits[0,-1,:].detach()

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# first two panels: one per model, logit value as a function of token index
for ax,logits,title in zip(axs[:2],(logits_small,logits_large),('GPT2 SMALL','GPT2 LARGE')):
  ax.plot(logits,'k.',alpha=.2)
  ax.set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Output logits',title=title)

# third panel: the two models' logits plotted against each other
axs[2].plot(logits_small,logits_large,'b.',alpha=.2)
axs[2].set(xlabel='GPT2 SMALL',ylabel='GPT2 LARGE',title='Comparison of both models')

plt.tight_layout()
plt.show()

# Manual softmax via direct implementation of the math

In [None]:
# manual softmax: a direct transcription of exp(x_i) / sum_j exp(x_j)
# (intentionally naive: the logits are exponentiated as-is, without any shift)
sm_manual_small = torch.exp(logits_small) / torch.sum(torch.exp(logits_small))
sm_manual_large = torch.exp(logits_large) / torch.sum(torch.exp(logits_large))

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# gpt2 small
axs[0].plot(sm_manual_small,'k.',alpha=.2)
axs[0].set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Softmax probability',title='GPT2 SMALL')

# gpt2 large
axs[1].plot(sm_manual_large,'k.',alpha=.2)
axs[1].set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Softmax probability',title='GPT2 LARGE')

# against each other
axs[2].plot(sm_manual_small,sm_manual_large,'b.',alpha=.2)
axs[2].set(xlabel='GPT2 SMALL',ylabel='GPT2 LARGE',title='Comparison of both models')

plt.tight_layout()
plt.show()

In [None]:
# uh oh...
# display the manually computed softmax for the small model to inspect its values
sm_manual_small

In [None]:
# wait but why?
# Inspect the raw logits that were fed into the manual softmax above.
# `logits_small` is used here (not the max-subtracted version, which is only
# created further down), so this cell runs on a fresh kernel and looks at the
# same values the manual softmax actually exponentiated.
idx = 10000  # an arbitrary token index to inspect
print('raw logit:     ',logits_small[idx])
print('exp(logit):    ',logits_small[idx].exp())
print('sum of all exp:',logits_small.exp().sum())

In [None]:
# exp() of the same large-magnitude negative number, computed by numpy and by torch
n = -123.0
for label,value in (('numpy:',np.exp(n)),('torch:',torch.exp(torch.tensor(n)))):
  print(label,value)

# Corrected softmax via normalization

In [None]:
# shift each logit vector so that its largest element becomes zero
logits_small_norm = logits_small - logits_small.max()
logits_large_norm = logits_large - logits_large.max()

# visualize
_,axs = plt.subplots(1,3,figsize=(12,3.5))

# first two panels: one per model, shifted logit as a function of token index
panels = (
  (axs[0],logits_small_norm,'GPT2 SMALL'),
  (axs[1],logits_large_norm,'GPT2 LARGE'),
)
for ax,shifted,title in panels:
  ax.plot(shifted,'k.',alpha=.2)
  ax.set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Output logits',title=title)

# third panel: the two models' shifted logits against each other
axs[2].plot(logits_small_norm,logits_large_norm,'b.',alpha=.2)
axs[2].set(xlabel='GPT2 SMALL',ylabel='GPT2 LARGE',title='Comparison of both models')

plt.tight_layout()
plt.show()

In [None]:
# now repeat the manual softmax, this time on the max-subtracted logits
# (each exp() is computed once and reused for numerator and denominator)
exp_small = torch.exp(logits_small_norm)
exp_large = torch.exp(logits_large_norm)
sm_manual_smallN = exp_small / torch.sum(exp_small)
sm_manual_largeN = exp_large / torch.sum(exp_large)

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# gpt2 small
axs[0].plot(sm_manual_smallN,'k.',alpha=.2)
axs[0].set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Softmax probability',title='GPT2 SMALL')

# gpt2 large
axs[1].plot(sm_manual_largeN,'k.',alpha=.2)
axs[1].set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Softmax probability',title='GPT2 LARGE')

# against each other
axs[2].plot(sm_manual_smallN,sm_manual_largeN,'b.',alpha=.2)
axs[2].set(xlabel='GPT2 SMALL',ylabel='GPT2 LARGE',title='Comparison of both models')

plt.tight_layout()
plt.show()

# Pytorch softmax function

In [None]:
# pytorch softmax, applied directly to the raw (unshifted) logits
sm_torch_small = F.softmax(logits_small,dim=-1)
sm_torch_large = F.softmax(logits_large,dim=-1)

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# gpt2 small
axs[0].plot(sm_torch_small,'k.',alpha=.2)
axs[0].set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Softmax probability',title='GPT2 SMALL')

# gpt2 large
axs[1].plot(sm_torch_large,'k.',alpha=.2)
axs[1].set(xlim=[-10,tokenizer.vocab_size+9],xlabel='Token index',ylabel='Softmax probability',title='GPT2 LARGE')

# against each other
axs[2].plot(sm_torch_small,sm_torch_large,'b.',alpha=.2)
axs[2].set(xlabel='GPT2 SMALL',ylabel='GPT2 LARGE',title='Comparison of both models')

plt.tight_layout()
plt.show()

In [None]:
# IPython introspection (`??`): show the source of F.softmax; not valid in plain Python
F.softmax??