In [3]:
# this workbook shows the impact of layernorm and mlp on attention tensors with simplified examples

import torch
from model_details import LayerNorm, CausalSelfAttention, MLP, Block

class ToyConfig:
    block_size: int = 4
    vocab_size: int = 12 # this is a toy the vocab is limited to a handful of letters that make a bunch of 4 letter words
    n_layer: int = 1
    n_head: int = 1
    n_embd: int = 6
    dropout: float = 0.0
    bias: bool = False # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    printModel: bool = True # True prints the model to debug

config = ToyConfig()
blk = Block(config)


In [5]:
# Define the given tensor
x = torch.randn(1, 4, 6)

print(x)

tensor([[[ 0.7238, -1.3490,  2.2708,  0.1644,  0.4643, -0.9596],
         [ 0.3577,  0.2595, -1.0803,  0.4079,  0.1390, -1.1237],
         [-1.9288, -0.8133,  0.6529,  0.1452,  0.1837, -2.2137],
         [ 0.0828, -1.8399, -0.7917, -0.6395,  0.5560, -0.2244]]])


In [6]:
y = blk(x) #Note since we have not done any training the output on this is relatively random
print ("Y : \n", y)

Y : 
 tensor([[[ 0.8407, -1.2674,  2.1852,  0.0537,  0.2427, -0.7833],
         [ 0.3849,  0.3378, -1.3973,  0.7551,  0.0442, -0.9987],
         [-1.5932, -0.7877,  0.3467,  0.4331, -0.2252, -1.9986],
         [ 0.1146, -1.6984, -0.8226, -0.3603,  0.4504,  0.0197]]],
       grad_fn=<AddBackward0>)


In [7]:
ln_1 = LayerNorm(config.n_embd, bias=config.bias)
attn = CausalSelfAttention(config)
ln_2 = LayerNorm(config.n_embd, bias=config.bias)
mlp = MLP(config)

In [8]:
print("before first layernorm: \n", x)
z = ln_1(x)
print("after first layernorm: \n", z)



before first layernorm: 
 tensor([[[ 0.7238, -1.3490,  2.2708,  0.1644,  0.4643, -0.9596],
         [ 0.3577,  0.2595, -1.0803,  0.4079,  0.1390, -1.1237],
         [-1.9288, -0.8133,  0.6529,  0.1452,  0.1837, -2.2137],
         [ 0.0828, -1.8399, -0.7917, -0.6395,  0.5560, -0.2244]]])
after first layernorm: 
 tensor([[[ 0.4272, -1.3272,  1.7365, -0.0463,  0.2075, -0.9977],
         [ 0.8019,  0.6537, -1.3697,  0.8778,  0.4716, -1.4354],
         [-1.1617, -0.1385,  1.2064,  0.7407,  0.7760, -1.4230],
         [ 0.7395, -1.8045, -0.4176, -0.2161,  1.3656,  0.3330]]],
       grad_fn=<NativeLayerNormBackward0>)


In [9]:
just_attn = attn(x)
ln_attn = attn(z)

print("X: \n", x)
print('Just Attention: \n', just_attn)
print('With layernorm: \n', ln_attn)


X: 
 tensor([[[ 0.7238, -1.3490,  2.2708,  0.1644,  0.4643, -0.9596],
         [ 0.3577,  0.2595, -1.0803,  0.4079,  0.1390, -1.1237],
         [-1.9288, -0.8133,  0.6529,  0.1452,  0.1837, -2.2137],
         [ 0.0828, -1.8399, -0.7917, -0.6395,  0.5560, -0.2244]]])
Just Attention: 
 tensor([[[ 0.1134, -0.0554, -0.0975, -0.2883,  0.3562,  0.4833],
         [ 0.1387, -0.0517, -0.0601, -0.1950,  0.3259,  0.3051],
         [ 0.1496, -0.0027, -0.0425, -0.3056,  0.3246,  0.3577],
         [ 0.1379,  0.0094, -0.0092, -0.3401,  0.2982,  0.3635]]],
       grad_fn=<UnsafeViewBackward0>)
With layernorm: 
 tensor([[[ 0.0598, -0.0578, -0.0833, -0.2837,  0.3234,  0.3881],
         [ 0.1718, -0.0576, -0.0611, -0.1994,  0.3682,  0.3163],
         [ 0.2174,  0.0009, -0.0398, -0.2464,  0.3312,  0.3657],
         [ 0.2003,  0.0136, -0.0110, -0.2572,  0.2743,  0.3961]]],
       grad_fn=<UnsafeViewBackward0>)
