|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>The Transformer block (code)</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # for 3D plotting

import torch
import torch.nn as nn
import torch.nn.functional as F

# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Demo of linear separation after dimensionality expansion

In [None]:
# n angles around the circle, stopping just short of 2*pi
n = 100
theta = np.linspace(0,2*np.pi-1/n,n)

# unit-circle coordinates, reused for both rings
cos_theta = np.cos(theta)
sin_theta = np.sin(theta)

# noisy 2D points on two concentric rings (radius 1 and radius 2);
# every coordinate gets its own Gaussian jitter (randn scaled by 1/10)
x_inner = cos_theta + np.random.randn(n)/10
y_inner = sin_theta + np.random.randn(n)/10
x_outer = 2*cos_theta + np.random.randn(n)/10
y_outer = 2*sin_theta + np.random.randn(n)/10

# nonlinear expansion into a 3rd dimension: z = distance from the origin
z_inner = np.sqrt(x_inner**2 + y_inner**2)
z_outer = np.sqrt(x_outer**2 + y_outer**2)


# marker styling shared by the two panels
inner_style = dict(markerfacecolor=[.7,.9,.7],markersize=9)
outer_style = dict(markerfacecolor=[.9,.7,.7],markersize=9)

fig = plt.figure(figsize=(12,5))

### left panel: the rings in their original 2D space
ax0 = fig.add_subplot(1,2,1)
ax0.plot(x_inner,y_inner,'ko',**inner_style)
ax0.plot(x_outer,y_outer,'ks',**outer_style)
ax0.axis('square')
ax0.set(title='Non-linearly separable in 2D',xlabel='x',ylabel='y',
        xticklabels=[],yticklabels=[])

### right panel: same points with the radius as a third axis
ax1 = fig.add_subplot(1,2,2,projection='3d')
ax1.plot(x_inner,y_inner,z_inner,'ko',**inner_style)
ax1.plot(x_outer,y_outer,z_outer,'ks',**outer_style)
ax1.set(title='Linearly separable in 3D',xlabel='x',ylabel='y',zlabel='Radius',
        xticklabels=[],yticklabels=[])
ax1.view_init(elev=20,azim=20)

plt.tight_layout()
plt.show()

# And now to the main part of the code :)

# Model hyperparameters

In [None]:
# data hyperparameters
# (number of tokens per sequence; used as the second dimension of the
#  fake token-embeddings tensor created further down)
seq_len = 8

# model hyperparameters
# (size of each token-embedding vector; also the input/output width of
#  every linear layer in the attention head and the Transformer block)
embed_dim = 128

# training hyperparameters
# (number of sequences per batch; used as the first dimension of the
#  fake token-embeddings tensor created further down)
batch_size = 5

# One attention head

In [None]:
# create one attention head
class OneAttentionHead(nn.Module):
  """A single causal self-attention head with an output projection.

  All four linear maps are square (embed_dim -> embed_dim) and have no bias.
  """

  def __init__(self,embed_dim):
    """Create the key/query/value projections and the post-attention mixer.

    Args:
      embed_dim: size of the last dimension of the input embeddings.
    """
    super().__init__()

    # projections applied to the incoming token embeddings
    self.key   = nn.Linear(embed_dim,embed_dim,bias=False)
    self.query = nn.Linear(embed_dim,embed_dim,bias=False)
    self.value = nn.Linear(embed_dim,embed_dim,bias=False)

    # linear re-weighting applied to the attention output
    self.W0    = nn.Linear(embed_dim,embed_dim,bias=False)

  def forward(self,x):
    """Apply causal attention to x and return the projected result.

    Args:
      x: tensor whose last dimension is embed_dim, e.g. the
         (batch, sequence, embed_dim) token embeddings used in this notebook.

    Returns:
      Tensor with the same shape as x.
    """

    # project the embeddings into queries, keys, and values
    queries,keys,values = self.query(x),self.key(x),self.value(x)

    # causal attention: each token only attends to itself and earlier tokens
    attended = F.scaled_dot_product_attention(queries,keys,values,is_causal=True)

    # linear weightings post-attention
    return self.W0(attended)

In [None]:
# explore the attention head: build one instance sized by embed_dim
onehead = OneAttentionHead(embed_dim)

# printing a module lists its registered submodules (key, query, value, W0)
print(onehead)

# run some fake data through:
# random "token embeddings" of shape (batch_size, seq_len, embed_dim)
tokenEmbeds = torch.randn(batch_size, seq_len, embed_dim)
out = onehead(tokenEmbeds)

# show the output's shape followed by its values
print(f'\nOutput ({out.shape}): \n{out}')

# Transformer block

In [None]:
# one Transformer block: attention sublayer followed by an MLP sublayer
class TransformerBlock(nn.Module):
  """Attention sublayer + feedforward (MLP) sublayer, each with a residual.

  In both sublayers the input is layer-normalized first, transformed, and
  the result is added back onto the un-normalized input.
  """

  def __init__(self,embed_dim):
    """Create the layers of both sublayers.

    Args:
      embed_dim: size of the last dimension of the input embeddings.
    """
    super().__init__()

    # width of the MLP's hidden layer (4x expansion)
    hidden_dim = 4*embed_dim

    # attention sublayer
    self.layerNormAttn = nn.LayerNorm(embed_dim)
    self.attn = OneAttentionHead(embed_dim)

    # feedforward (MLP) sublayer
    self.layerNormMLP  = nn.LayerNorm(embed_dim)
    self.W1   = nn.Linear(embed_dim,hidden_dim) # expansion
    self.gelu = nn.GELU()                       # nonlinearity
    self.W2   = nn.Linear(hidden_dim,embed_dim) # contraction


  def forward(self,x):
    """Pass x through the attention sublayer and then the MLP sublayer.

    Args:
      x: tensor whose last dimension is embed_dim.

    Returns:
      Tensor with the same shape as x.
    """

    ## --- attention sublayer --- ##
    # layernorm -> attention, added onto the pre-attention data
    x = x + self.attn(self.layerNormAttn(x))

    ## ------ MLP sublayer ------ ##
    # layernorm -> expansion -> nonlinearity -> contraction,
    # added onto the pre-MLP data
    expanded = self.W1(self.layerNormMLP(x))
    return x + self.W2(self.gelu(expanded))

In [None]:
# create and explore an instance of the Transformer block (sized by embed_dim);
# printing it lists the layers of both the attention and the MLP sublayers
transblock = TransformerBlock(embed_dim)
print(transblock)

In [None]:
# again, pushing data through: the same fake token embeddings created earlier
# now go through the full Transformer block (this overwrites the earlier `out`)
out = transblock(tokenEmbeds)

# show the output's shape followed by its values
print(f'\nOutput ({out.shape}): \n{out}')