|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: How many parameters?</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch

!pip install torchinfo
from torchinfo import summary

# svg plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# Hugging Face classes: the GPT-2 tokenizer and the generic causal-LM model loader
from transformers import GPT2Tokenizer, AutoModelForCausalLM

# tokenizer instance used to encode the test sentence for the exercises below
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Exercise 1: Import GPT2 models

In [None]:
# dictionary of modelname:identifier
model_ids = {
    'small':  'gpt2',        # 124M
    'medium': 'gpt2-medium', # 355M
    'large':  'gpt2-large',  # 774M
    'xl':     'gpt2-xl'      # 1.6B (including this model really slows things down...)
}

# load all models into a dictionary, keyed by the short size name
# (the loop variable is model_id rather than id, so the builtin id() is not
#  shadowed for the rest of the kernel session)
models = {}
for name, model_id in model_ids.items():
  models[name] = AutoModelForCausalLM.from_pretrained(model_id)

In [None]:
# display the keys of the models dictionary (the short size names used when loading)
models.keys()

In [None]:
# now you can iterate over them
for name, model in models.items():

  # total number of elements over all parameter tensors of this model;
  # .parameters() yields each tensor once, so shared (tied) matrices are not double-counted
  numParams = sum(p.numel() for p in model.parameters())

  print(f'{numParams:13,} parameters in gpt2-{name}')

# Exercise 2: Compare GPT2-small against "model 5"

In [None]:
# one fixed sentence, tokenized into a PyTorch tensor and reused by all exercises
x = tokenizer.encode(
    'This is a test. One that we have done countless times before.',
    return_tensors='pt',
)

# show the token indices
x

In [None]:
# summary of model and parameters
# torchinfo runs the tokens in x through GPT2-small and reports, per layer,
# the input size, output size, and number of parameters
sumry = summary(models['small'], input_data=x, col_names=['input_size','output_size','num_params'])
print(sumry)

In [None]:
# number of elements in the final output (unembedding) layer
# NOTE: the summary's total includes this layer although its matrix is the same
# as the token-embedding matrix, so it is subtracted to avoid counting it twice
finalHeadCount = models['small'].lm_head.weight.numel() # hint: .lm_head.weight

print(f'Total trainable parameters: {sumry.total_params - finalHeadCount:,}')

In [None]:
# show that token embedding and final output layer are the same
plt.figure(figsize=(8,4))

# pull the same row (one token) out of the embedding and unembedding matrices;
# each row is that token's vector across the embedding dimensions
tokenIdx = 1000 # arbitrary vocabulary entry; any valid token index can be used
tokW = models['small'].transformer.wte.weight[tokenIdx,:].detach().numpy()
head = models['small'].lm_head.weight[tokenIdx,:].detach().numpy()

# line for the embedding, dots for the unembedding: identical vectors overlap exactly
plt.plot(tokW,color=[.9,.7,.9],label='Token embeddings')
plt.plot(head,'o',markerfacecolor=[.9,.9,.7],markersize=3,label='Unembeddings')

plt.legend()
plt.gca().set(xlabel='Dimension',ylabel='Value',xlim=[0,len(tokW)],
              title=f'Correlation = {np.corrcoef(tokW,head)[0,1]:.3f}')

plt.show()

# Exercise 3: Weight and bias parameters, as percentage of total



In [None]:
# list the name of every parameter tensor in GPT2-small, one name per line
print(*(name for name,mat in models['small'].named_parameters()), sep='\n')

In [None]:
# initialize: one row per model; columns are [weights, biases, total parameters]
# (integer dtype so the counts can be printed with the ',d' format)
param_counts = np.zeros((len(models),3),dtype=int)

for idx,(modelname,model) in enumerate(models.items()):

  # get the summary for all parameters
  # (verbose=0 keeps torchinfo from printing the full table for every model)
  sumry = summary(model, input_data=x, verbose=0)

  # the summary total includes the output head, whose matrix is shared with the
  # token embedding, so remove it to avoid counting that matrix twice
  finalHeadCount = model.lm_head.weight.numel()
  param_counts[idx,2] = sumry.total_params - finalHeadCount

  # loop through all parameters and increment the parameter count
  for layername,mat in model.named_parameters():
    if 'weight' in layername:
      param_counts[idx,0] += mat.numel()
    elif 'bias' in layername:
      param_counts[idx,1] += mat.numel()

  # counts and their share of the model total
  pctWeights = 100*param_counts[idx,0]/param_counts[idx,2]
  pctBiases  = 100*param_counts[idx,1]/param_counts[idx,2]

  print(f'\n** Model "{modelname}":')
  print(f'Total weights: {param_counts[idx,0]:13,d} ({pctWeights:6.3f}% of all params)')
  print(f'Total biases:  {param_counts[idx,1]:13,d} ({pctBiases:6.3f}% of all params)')

# Exercise 4: Parameters in attention vs. MLP blocks

In [None]:
# initialize: one row per model; columns are [attention, MLP, total parameters]
param_counts = np.zeros((len(models.keys()),3),dtype=int)

for idx,(modelname,model) in enumerate(models.items()):

  # get the summary for all parameters
  # (verbose=0 keeps torchinfo from printing the full table for every model)
  sumry = summary(model, input_data=x, verbose=0)

  # the summary total includes the output head, whose matrix is shared with the
  # token embedding, so remove it to avoid counting that matrix twice
  finalHeadCount = model.lm_head.weight.numel()
  param_counts[idx,2] = sumry.total_params - finalHeadCount

  # loop through all parameters and increment parameter count
  # NOTE: every parameter (weights and biases) whose name places it in an attention
  # or MLP sub-block is counted; all other parameters only enter the total
  for layername,mat in model.named_parameters():
    if 'attn' in layername:
      param_counts[idx,0] += mat.numel()
    elif 'mlp' in layername:
      param_counts[idx,1] += mat.numel()

  print(f'\n** Model "{modelname}" ({len(model.transformer.h)} transformer blocks):')
  print(f'Att weights: {param_counts[idx,0]:11,d} ({100*param_counts[idx,0]/param_counts[idx,2]:5.2f}% of all params)')
  print(f'MLP weights: {param_counts[idx,1]:11,d} ({100*param_counts[idx,1]/param_counts[idx,2]:5.2f}% of all params)')

In [None]:
# bar positions: one group per loaded model, spaced by 2 so the two bars
# (offset by -.4 and +.4) sit side by side; derived from the number of models
# so the plot still works if a model (e.g., 'xl') is left out
xpos = 2*np.arange(len(models))

# percentages from param_counts: column 0 = attention, 1 = MLP, 2 = model total
attPct = 100*param_counts[:,0]/param_counts[:,2]
mlpPct = 100*param_counts[:,1]/param_counts[:,2]

plt.bar(xpos-.4,attPct,label='Attention',color=[.85,.4,.02])
plt.bar(xpos+.4,mlpPct,label='MLP',color=[.3,.01,.42])

plt.legend()
plt.gca().set(xticks=xpos,xticklabels=list(models.keys()),xlabel='GPT2 model version',
              ylabel='Percent of total model weights',title='Percentage weights per layer type')
plt.show()

# Exercise 5: How many layernorm parameters?