Hands-on Code Implementation of the Chapter 03 of 'Build LLM From Scratch'.

In [3]:
import torch
from importlib.metadata import version
import random

print("torch version:", version("torch"))
torch.manual_seed(123)

torch version: 2.4.0


<torch._C.Generator at 0x1704db4c0f0>

In [4]:
# Example Input Tensor:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)
print(inputs.shape)

torch.Size([6, 3])


### Attention Type 1: Self-Attention without trainable weights

In [3]:
#Given an input tensor (input embeddings) of "m" dimension, output context vector of size "n" dimension (m >,<,= n)

#Step 1: Attention Score
def attention_score(input_tensor,all=True):

    if all:
        print('Computing attention scores for all inputs: ')
        score = torch.zeros(input_tensor.shape)
        score = input_tensor @ input_tensor.T
        
        return score
    else:
        print('Computing attention scores for 1 input: ')
        query = random.choice(list(range(input_tensor.shape[0])))
        print('Randomly selected Query vector : Input ', query)
        q_vector = input_tensor[query]
        score = torch.empty(input_tensor.shape[0])
        for i, x_i in enumerate(input_tensor):
            score[i] = torch.dot(x_i,q_vector)
    
        return score, query

#Step 2: Attention weights
def attention_weight(score):
    
    print("Computing normalized attention weights: ")
    weights = torch.zeros(score.shape)
    weights = torch.softmax(score,dim=-1)
    print('SUM : ', weights.sum(dim=-1))
    print('Attention Weights: \n',weights)
    return weights

#Step 3: Context vectors
def context_vector(input_tensor,all=True):
    
    if all:
        att_score = attention_score(input_tensor,all)
        att_weight = attention_weight(att_score)
        print('Computing context vectors for ALL input: ')
        context = att_weight @ input_tensor
    
    else:
        att_score,query = attention_score(input_tensor,all)
        att_weight = attention_weight(att_score)
        print('Computing context vector for input: ', query)
        
        context = torch.zeros(input_tensor[query].shape)
        for i, x_i in enumerate(input_tensor):
            context += att_weight[i] * x_i

    return context

In [4]:
## TESTING:: 
context_all = context_vector(inputs,True)
print(context_all)
print('*********************\n')
context_1 = context_vector(inputs,False)
print(context_1)

Computing attention scores for all inputs: 
Computing normalized attention weights: 
SUM :  tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])
Attention Weights: 
 tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
Computing context vectors for ALL input: 
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])
*********************

Computing attention scores for 1 input: 
Randomly selected Query vector : Input  2
Computing normalized attention weights: 
SUM :  tensor(1.0000)
Attention Weights: 
 tensor([0.1390, 0.2369, 0.2326, 0.1242, 0.11

### Attention Type 2: Self-Attention with trainable weights (SCALED DOT PRODUCT ATTENTION)

In [5]:
#Given an input tensor (input embeddings) of "m" dimension, output context vector of size "n" dimension (m >,<,= n)

#Step 1: Define the 3 weight matrices: Q,K,V (add the bias parameter if using torch.nn.Linear ONLY)
def create_attention_matrices(dim_in, dim_out,qkv_bias=False):

    #USING torch.nn.Parameter:
    # W_query = torch.nn.Parameter(torch.rand(dim_in,dim_out),requires_grad=True)
    # W_key = torch.nn.Parameter(torch.rand(dim_in,dim_out),requires_grad=True)
    # W_value = torch.nn.Parameter(torch.rand(dim_in,dim_out),requires_grad=True)

    #USING torch.nn.Linear:
    W_query = torch.nn.Linear(dim_in,dim_out,bias=qkv_bias)
    W_key = torch.nn.Linear(dim_in,dim_out,bias=qkv_bias)
    W_value = torch.nn.Linear(dim_in,dim_out,bias=qkv_bias)

    return W_query, W_key, W_value

#Step 2: Project the input tensor through the 3 matrices to get the 3 vectors: query_vec, key_vector, value_vector
def create_input_vectors(input_tensor, dim_out=2,qkv_bias=False):

    dim_in = input_tensor.shape[-1]
    W_query,W_key, W_value = create_attention_matrices(dim_in,dim_out,qkv_bias=False)

    #USING torch.nn.Parameter:
    # Vec_query = input_tensor @ W_query
    # Vec_key = input_tensor @ W_key
    # Vec_value = input_tensor @ W_value

    #USING torch.nn.Linear:
    Vec_query = W_query(input_tensor)
    Vec_key = W_key(input_tensor)
    Vec_value = W_value(input_tensor)

    return Vec_query, Vec_key, Vec_value

#Step 3: Compute attention scores using query_vector and key_vector
def self_attention_score(input_tensor, dim_out=2,all=True,qkv_bias=False):
    
    Vec_query, Vec_key, Vec_value = create_input_vectors(input_tensor,dim_out,qkv_bias=False)
    
    if all:
        print('Computing attention scores for ALL inputs: ')
        att_score = Vec_query @ Vec_key.T
        
    else:
        print('Computing attention scores for 1 input: ')
        query = random.choice(list(range(input_tensor.shape[0])))
        print('Randomly selected Query vector : Input ', query)
        att_score = Vec_query[query] @ Vec_key.T
    
    return att_score, Vec_value

#Step 4: Normalize attention scores to get attention weights
def self_attention_weight(score,dim_k):
    print("Computing normalized attention weights: ")
    weights = torch.softmax(score /dim_k**0.5, dim=-1)
    print('SUM : ', weights.sum(dim=-1))
    print('Attention Weights: \n',weights)
    return weights

#Step 5: Compute context vectors using key_vector and attention weights
def context_vector(input_tensor,dim_out=2, all= True,qkv_bias=False):
    
    att_score, Vec_value = self_attention_score(input_tensor,dim_out,all,qkv_bias=False)
    dim_k = Vec_value.shape[-1]
    att_weight = self_attention_weight(att_score,dim_k)
    context = att_weight @ Vec_value
        
    return context

In [6]:
## TESTING:: 
context_all = context_vector(inputs,5,True,qkv_bias=False)
print(context_all)
print('*********************\n')
context_1 = context_vector(inputs,5,False,qkv_bias=False)
print(context_1)

Computing attention scores for ALL inputs: 
Computing normalized attention weights: 
SUM :  tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)
Attention Weights: 
 tensor([[0.1640, 0.1621, 0.1618, 0.1738, 0.1626, 0.1758],
        [0.1699, 0.1542, 0.1545, 0.1765, 0.1732, 0.1716],
        [0.1699, 0.1543, 0.1546, 0.1765, 0.1730, 0.1717],
        [0.1691, 0.1592, 0.1594, 0.1719, 0.1723, 0.1681],
        [0.1694, 0.1599, 0.1598, 0.1726, 0.1663, 0.1720],
        [0.1690, 0.1576, 0.1580, 0.1728, 0.1751, 0.1675]],
       grad_fn=<SoftmaxBackward0>)
tensor([[ 0.5029,  0.3415, -0.5182, -0.2100,  0.2738],
        [ 0.4999,  0.3371, -0.5156, -0.2072,  0.2678],
        [ 0.4999,  0.3372, -0.5157, -0.2073,  0.2679],
        [ 0.5027,  0.3390, -0.5186, -0.2080,  0.2690],
        [ 0.5025,  0.3394, -0.5185, -0.2093,  0.2707],
        [ 0.5021,  0.3384, -0.5178, -0.2072,  0.2680]], grad_fn=<MmBackward0>)
*********************

Computing attention scores for 1 input

### Attention Type 3: Causal (Masked) and Sparse Self-Attention 

In [7]:
#Given an input tensor (input embeddings) of "m" dimension, output context vector of size "n" dimension (m >,<,= n)
#Build upon the previous self-attention scores and weights calculation:    
    
#Step 2: Depending upon normalization choice (softmax/regular), create the mask
def create_causal_mask(context_length,softmax=True):
    
    mask = torch.triu(torch.ones(context_length,context_length),diagonal=1)
    return mask

#Step 3: Apply mask on self-attention scores or weights to get masked weights
def causal_attention_weights(score_weight,context_length,softmax):
    
    mask = create_causal_mask(context_length,softmax)
    
    if softmax:
        masked_scores = score_weight.masked_fill(mask.bool(), -torch.inf)
    else:
        masked_scores = score_weight.masked_fill(mask.bool(),0)
    
    return masked_scores

#Step 6: Compute context vectors using key_vector and (sparse + masked) attention weights
def sparseCausal_context_vector(input_tensor,dim_out,all,softmax,qkv_bias=False):

    #Step 1: Load the self-attention scores and weights
    score_weight, Vec_value = self_attention_score(input_tensor,dim_out,all,qkv_bias)
        
    if not softmax:    
        dim_k = Vec_value.shape[-1]
        score_weight = self_attention_weight(score_weight,dim_k)
        
    context_length = input_tensor.shape[0]
    masked_scores = causal_attention_weights(score_weight,context_length,softmax)
    print('Masked Scores/Weights : \n', masked_scores)

    #Step 4: Normalize to get masked causal weights
    dim_k = Vec_value.shape[-1]
    
    if softmax:
        masked_weights = self_attention_weight(masked_scores,dim_k)
    else:
        row_sum = masked_scores.sum(dim=-1,keepdim=True)
        masked_weights = masked_scores/row_sum
    
    torch.manual_seed(123)
    #Step 5: Add Dropout to create sparcity in the masked causal weight matrix
    dropout = torch.nn.Dropout(0.5)
    sparse_maskedWeights = dropout(masked_weights)
    print('Dropout Weights: \n',sparse_maskedWeights )
    context = sparse_maskedWeights @ Vec_value

    return context

In [8]:
## TESTING:: 
torch.manual_seed(123)
context_all = sparseCausal_context_vector(inputs,5,True,True,qkv_bias=False)
print(context_all)
print('*********************\n')
# context_1 = sparseCausal_context_vector(inputs,5,True,False)
# print(context_1)

Computing attention scores for ALL inputs: 
Masked Scores/Weights : 
 tensor([[-0.2541,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.4388, -0.6557,    -inf,    -inf,    -inf,    -inf],
        [-0.4341, -0.6496, -0.6458,    -inf,    -inf,    -inf],
        [-0.2486, -0.3847, -0.3811, -0.2127,    -inf,    -inf],
        [-0.2277, -0.3571, -0.3577, -0.1863, -0.2684,    -inf],
        [-0.3094, -0.4661, -0.4605, -0.2595, -0.2306, -0.3302]],
       grad_fn=<MaskedFillBackward0>)
Computing normalized attention weights: 
SUM :  tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)
Attention Weights: 
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5242, 0.4758, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3549, 0.3223, 0.3228, 0.0000, 0.0000, 0.0000],
        [0.2564, 0.2413, 0.2417, 0.2606, 0.0000, 0.0000],
        [0.2046, 0.1931, 0.1930, 0.2084, 0.2009, 0.0000],
        [0.1690, 0.1576, 0.1580, 0.1728, 0.1751, 0.1675]],
 

### Attention Type 4: Multi Head Causal Self-Attention

##### VERSION 1: Stacking multiple single-head attention layers:

In [9]:
#Given an input tensor (input embeddings) of "m" dimension, output context vector of size "n" dimension (m >,<,= n)
#Build upon the previous causal self-attention scores and weights calculation:   

def multiHead_context_vectors(input_tensor,heads,dim_out,all,softmax,qkv_bias=False):

    mha_context_vectors = []

    #Step 1: Get the different context vectors from each head:
    for i in range(heads):
        print('Head: ', i)
        context_vector = sparseCausal_context_vector(input_tensor,dim_out,all,softmax,qkv_bias=False)
        mha_context_vectors.append(context_vector)

        #Step 2: Concatenate all the heads output to create the final context vector:
        mha = torch.cat([i for i in mha_context_vectors],dim=-1)
    
    assert mha.shape[-1] == heads * dim_out
    return mha
    

In [10]:
## TESTING:: 
torch.manual_seed(123)
heads = 2
dim_out = 2
context_all = multiHead_context_vectors(inputs,heads,dim_out,True,True,qkv_bias=False)
print(context_all)
print('MHA Context Vector Shape: ', context_all.shape)
print('*********************\n')

Head:  0
Computing attention scores for ALL inputs: 
Masked Scores/Weights : 
 tensor([[0.3111,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1655, 0.2602,   -inf,   -inf,   -inf,   -inf],
        [0.1667, 0.2602, 0.2577,   -inf,   -inf,   -inf],
        [0.0510, 0.1080, 0.1064, 0.0643,   -inf,   -inf],
        [0.1415, 0.1875, 0.1863, 0.0987, 0.1121,   -inf],
        [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],
       grad_fn=<MaskedFillBackward0>)
Computing normalized attention weights: 
SUM :  tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)
Attention Weights: 
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],
        [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],
        [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<SoftmaxBackw

##### VERSION 2: Multi head attention layers with weight splits:

In [6]:
#Given an input tensor (input embeddings) of "m" dimension, output context vector of size "n" dimension (m >,<,= n)
#Build upon the previous causal self-attention scores and weights calculation: 

def mha_causal_attention(input_tensor, heads, dim_out,dropout_rate, qkv_bias=False):

    #Step 1: Get the necessary details from the input tensor:
    batch, context_length, dim_in = input_tensor.shape #Shape: batch_size, 6,3

    #Step 2: Make sure the final output dimension and the number of heads are situable for attention operation:
    assert (dim_out % heads) == 0, \
            "d_out must be divisible by heads"
    
    #Step 3: Calculate the output dimension of each heads:
    dim_head = dim_out // heads
    print('Dimension of each heads: ', dim_head)
    print('Context length: ', context_length)
    print('Input embedding dimension detected: ', dim_in)
    print('Final context vector dimension (required): ', dim_out)

    #Step 4: Create the Q,K and V matrices:
    W_query = torch.nn.Linear(dim_in,dim_out,bias=qkv_bias) #Shape : 3x4
    W_key = torch.nn.Linear(dim_in,dim_out,bias=qkv_bias)
    W_value = torch.nn.Linear(dim_in, dim_out, bias=qkv_bias)
    
    #Step 5: Get the Q,K, and V projections
    Vec_query = W_query(input_tensor) #Shape: batch_size, 6,4
    Vec_key = W_key(input_tensor)
    Vec_value = W_value(input_tensor)

    print('Vec_query Shape before unrolling: ',Vec_query.shape) 
    
    #Step 6: Divide the original Q,K,V projections into smaller projections (each projection for each head). Attention will be computed on each of these smaller projections.
    Vec_query = Vec_query.view(batch,context_length, heads, dim_head) #Shape: b,6,2,2
    Vec_key = Vec_key.view(batch,context_length, heads, dim_head)
    Vec_value = Vec_value.view(batch,context_length, heads, dim_head)
    
    print('Vec_query Shape after unrolling: ', Vec_query.shape)

    #Step 7: Transform or Shuffle the dimensions of the smaller projections to make the tensors situable for attention.
    Vec_query = Vec_query.transpose(1,2) #Shape: b, 2, 6, 2
    Vec_key = Vec_key.transpose(1,2)
    Vec_value = Vec_value.transpose(1,2)

    print('Transposed unrolled Vec_query: ', Vec_query.shape)

    print('\nTensors ready for applying attention.. Yayyyy!!!!\n')

    #Step 8: Compute un-normalized self-attention score [The matrix multiplication is carried out between the 2 last dimensions (context_length, dim_head) 
                                                        # and then repeated for the individual heads ]
    attention_score = Vec_query @ Vec_key.transpose(2,3)   #Shape: Last 2 dim --> query: 6x2 and key.T: 2x6
    print('Shape of attention scores matrix: ', attention_score.shape)

    #Step 9: Create the mask for causal attention:
    mask = torch.triu(torch.ones(context_length,context_length), diagonal=1)
    
    #Step 10: Truncate the original mask to context length size and convert to boolean
    mask = mask.bool()[:context_length, :context_length]

    #Step 11: Apply the mask to get the masked un-normalized attention scores
    masked_scores = attention_score.masked_fill(mask, -torch.inf)

    #Step 12: Compute normalized and scaled attention weights
    dim_k = Vec_key.shape[-1]
    attention_weight = torch.softmax(masked_scores / dim_k**0.5, dim=-1)

    #Step 13: Create the dropout layer
    torch.manual_seed(123)
    dropout = torch.nn.Dropout(dropout_rate)

    #Step 14: Add the dropouts on the attention weights to get sparse causal attention weights
    sparse_attention_weights = dropout(attention_weight) #Shape: b,heads,6,6
    print('Shape of sparse attention weights matrix: ', sparse_attention_weights.shape)

    #Step 15: Compute the context vectors using the sparse attention weights and value vector
    context_vector = sparse_attention_weights @ Vec_value #Shape: last 2 dim --> weights: 6x6 and value: 6x2
    print('Shape of context vector: ', context_vector.shape) #Shape: b, heads, 6, 2 or b,2,6,2

    #Step 16: Perform re-transpositon of the context vector to make the tensor situable for rolling the last 2 dimension into 1 dimension (dim_out)
    context_vector = context_vector.transpose(1,2) #Shape: b,6,2,2

    #Step 17: Rolling the last two dimension back into 1 to make the tensor situable for final output: dim_out = heads * dim_head
    context_vector = context_vector.contiguous().view(batch,context_length,dim_out)

    #Step 18: Create the final linear projection layer
    final_projection = torch.nn.Linear(dim_out, dim_out)

    #Step 19: Perform the final projection to get the FINAL Context Vector
    context_vector = final_projection(context_vector)
    print('Final CONTEXT VECTOR SHAPE: ', context_vector.shape)

    #Step 20: Perform a final assertion check to make sure dim_out = heads * dim_head
    assert context_vector.shape[-1] == heads * dim_head
    print('\nMULTI-HEAD CAUSAL SELF-ATTENTION WITH WEIGHT SPLIT DONE... YAYYYYYYYYYYY!!!!!!!!!!!!!!!!')    

    return context_vector   

In [7]:
input_tensor = torch.stack((inputs, inputs), dim=0)
print('Shape of input tensor: ', input_tensor.shape)
dim_out = 4
heads = 2
dropout_rate = 0.0
mha_causal_attention(input_tensor, heads, dim_out,dropout_rate, qkv_bias=False)

Shape of input tensor:  torch.Size([2, 6, 3])
Dimension of each heads:  2
Context length:  6
Input embedding dimension detected:  3
Final context vector dimension (required):  4
Vec_query Shape before unrolling:  torch.Size([2, 6, 4])
Vec_query Shape after unrolling:  torch.Size([2, 6, 2, 2])
Transposed unrolled Vec_query:  torch.Size([2, 2, 6, 2])

Tensors ready for applying attention.. Yayyyy!!!!

Shape of attention scores matrix:  torch.Size([2, 2, 6, 6])
Shape of sparse attention weights matrix:  torch.Size([2, 2, 6, 6])
Shape of context vector:  torch.Size([2, 2, 6, 2])
Final CONTEXT VECTOR SHAPE:  torch.Size([2, 6, 4])

MULTI-HEAD CAUSAL SELF-ATTENTION WITH WEIGHT SPLIT DONE... YAYYYYYYYYYYY!!!!!!!!!!!!!!!!


tensor([[[-0.4007,  0.4685, -0.1581, -0.0175],
         [-0.3662,  0.4427, -0.1431, -0.0633],
         [-0.3570,  0.4283, -0.1400, -0.0803],
         [-0.3568,  0.4215, -0.1364, -0.0536],
         [-0.3571,  0.3630, -0.1364, -0.0600],
         [-0.3555,  0.3938, -0.1337, -0.0424]],

        [[-0.4007,  0.4685, -0.1581, -0.0175],
         [-0.3662,  0.4427, -0.1431, -0.0633],
         [-0.3570,  0.4283, -0.1400, -0.0803],
         [-0.3568,  0.4215, -0.1364, -0.0536],
         [-0.3571,  0.3630, -0.1364, -0.0600],
         [-0.3555,  0.3938, -0.1337, -0.0424]]], grad_fn=<ViewBackward0>)