In [None]:
# Hyperparameters for the GPT-2 "small" (124M) model.
# Every model class in this notebook reads its sizes from this mapping.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token-embedding table / width of the output head
    context_length=1024,   # rows of the positional-embedding table / size of the causal mask
    emb_dim=768,           # embedding width of every token vector
    n_heads=12,            # attention heads per transformer block
    n_layers=12,           # number of stacked transformer blocks
    drop_rate=0.1,         # probability used by every nn.Dropout layer
    qkv_bias=False,        # whether the query/key/value projections carry a bias term
)

In [2]:
#dummy gpt model class

#step 1:use a placeholder for transformer block

#step 2:use a placeholder for LayerNorm
import torch 
import torch.nn as nn
class DummyGPTModel(nn.Module):
    """Skeleton of the GPT architecture with placeholder blocks and norm.

    The data flow is: token embedding + positional embedding -> dropout ->
    stack of (placeholder) transformer blocks -> (placeholder) final norm ->
    linear output head.

    Args:
        cfg: configuration mapping providing "vocab_size", "emb_dim",
            "context_length", "drop_rate" and "n_layers".
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        # Lookup tables: one row per token id and one row per position.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Placeholder transformer blocks, one per configured layer.
        placeholder_blocks = [
            DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])
        ]
        self.trf_blocks = nn.Sequential(*placeholder_blocks)

        # Placeholder for the final layer normalization.
        self.final_norm = DummyLayerNorm(emb_dim)

        # Projects every token vector onto the vocabulary (no bias term).
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids to vocabulary logits.

        Args:
            in_idx: 2-D tensor of token ids, shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, vocab_size). These are raw
            logits (unnormalized scores); no softmax is applied here.
        """
        # Unpacking into two names requires in_idx to be exactly 2-D.
        _, seq_len = in_idx.shape

        # One positional vector per position 0..seq_len-1; the same vectors
        # are broadcast over every sequence in the batch.
        positions = torch.arange(seq_len, device=in_idx.device)
        x = self.tok_emb(in_idx) + self.pos_emb(positions)

        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        # Shape so far: (batch_size, seq_len, emb_dim).
        return self.out_head(x)

class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block: an identity module with no parameters."""

    def __init__(self, cfg):
        # cfg is accepted only so the constructor signature matches a real
        # block; nothing is read from it.
        super().__init__()

    def forward(self, x):
        """Return the input unchanged."""
        return x


class DummyLayerNorm(nn.Module):
    """Placeholder layer normalization: an identity module with no parameters."""

    def __init__(self, cfg):
        # NOTE: despite its name, this argument receives the embedding size
        # (cfg["emb_dim"]) where DummyGPTModel constructs it. It is unused.
        super().__init__()

    def forward(self, x):
        """Return the input unchanged."""
        return x

In [3]:
# Step 1: Tokenization

import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Two example prompts; torch.stack below requires both to encode to the same
# number of token ids (the printed batch shows four ids per prompt).
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to a 1-D tensor of token ids, then stack the tensors
# along a new leading batch dimension.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)
# `batch` holds one row of token ids per input sequence.

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [4]:
# Step 2: instantiate the dummy GPT model and run the token-id batch through it
torch.manual_seed(123)
model=DummyGPTModel(GPT_CONFIG_124M)
logits=model(batch)
print("Output shape: ",logits.shape)
#the result is a 3-D tensor of shape (batch, tokens, vocab_size): one row of raw logits (unnormalized scores, not probabilities) per token, with one entry per vocabulary item
#NOTE(review): the recorded output reports a last dimension of 50256 while GPT_CONFIG_124M sets vocab_size to 50257 - re-run the notebook from the top to confirm
print(logits)

Output shape:  torch.Size([2, 4, 50256])
tensor([[[ 0.4128, -0.8998, -0.5579,  ...,  0.4135,  0.3933, -1.0613],
         [ 0.5321,  0.4538, -0.3093,  ...,  1.1302, -0.4263, -1.3800],
         [-0.4274, -0.0532,  1.5788,  ...,  0.6545,  0.4722,  1.7405],
         [-1.1740,  0.2963,  1.8822,  ...,  0.0439,  0.0202,  0.0705]],

        [[-0.4007, -1.3970, -0.2163,  ...,  0.4030,  0.6206, -0.7461],
         [ 0.8890, -0.4229, -0.0288,  ...,  1.2841, -0.7089, -0.7428],
         [ 0.8619, -0.0722,  1.6096,  ...,  0.6957,  0.2457, -0.4580],
         [-0.5883,  0.0320, -0.5026,  ..., -0.3743,  0.4036,  1.0022]]],
       grad_fn=<UnsafeViewBackward0>)


In [5]:
#Layer Normalization - simple example

torch.manual_seed(123)
batch_example=torch.randn(2,5)#a batch of 2 rows with 5 random features each
layer=nn.Sequential(nn.Linear(5,6),nn.ReLU())#maps the 5 input features of each row to 6 outputs, then zeroes negatives with ReLU
out=layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [6]:
#mean and variance of each row of the layer output
mean=out.mean(dim=-1,keepdim=True)#dim=-1 reduces over the columns of each row; keepdim=True keeps a trailing size-1 dimension so the result broadcasts against `out`
var=out.var(dim=-1,keepdim=True)#no unbiased=False is passed here, so torch's default variance estimator is used
print("Mean: ",mean)
print("Variance: ",var) 
#each row of the printed tensors is the mean / variance of the corresponding input row

Mean:  tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:  tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [7]:
#apply normalization: subtract the row mean and divide by the row standard deviation
out_norm=(out-mean)/torch.sqrt(var)
#NOTE: `mean` and `var` are rebound below to the statistics of the normalized output, so re-running this cell without re-running the previous one would normalize with the wrong statistics
mean=out_norm.mean(dim=-1,keepdims=True)
var=out_norm.var(dim=-1,keepdims=True)
print("Normalized layer outputs:\n",out_norm)
print("Mean: ",mean)
print("Variance: ",var)
#after normalization the mean of each row is approximately zero and the variance is one

Normalized layer outputs:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean:  tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)
Variance:  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [8]:
#the means above are only approximately zero (e.g. 9.9e-09); turn off scientific notation so they print as 0.0000
torch.set_printoptions(sci_mode=False)
print("Mean: ",mean)
print("Variance: ",var)
#now the printed mean is 0 and the printed variance is 1 for every row

Mean:  tensor([[0.0000],
        [0.0000]], grad_fn=<MeanBackward1>)
Variance:  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [9]:
#Layer Normalization Class
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Args:
        emb_dim: size of the last dimension of the inputs; also the length of
            the learnable `scale` and `shift` vectors.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5
        # Learnable per-feature gain (starts at 1) and offset (starts at 0).
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize each vector along the last dimension, then scale and shift.

        Args:
            x: tensor whose last dimension has size `emb_dim`.

        Returns:
            Tensor with the same shape as `x`.
        """
        centered = x - x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n rather than n-1 (no Bessel correction).
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalized + self.shift
    
# Step 1: Normalization = ironing clothes

# You flatten everything, remove wrinkles, make all shirts look neat.

# But now all shirts look too similar.

# Step 2: scale = resizing the shirt

# Make it tighter or looser.

# Step 3: shift = moving the shirt up/down

# Adjust the position so it fits perfectly.

In [10]:
#apply the LayerNorm class to the 2x5 example batch and verify the per-row statistics
ln=LayerNorm(emb_dim=5)
out_ln=ln(batch_example)
mean=out_ln.mean(dim=-1,keepdims=True)
var=out_ln.var(dim=-1,keepdims=True,unbiased=False)#unbiased=False matches the variance estimator used inside LayerNorm.forward
print("Mean:\n",mean)
print("Variance:\n",var)

Mean:
 tensor([[-0.0000],
        [ 0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [11]:
#GELU Activation class
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of `x`."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * x ** 3
        gate = 1 + torch.tanh(sqrt_two_over_pi * cubic_term)
        return 0.5 * x * gate

In [12]:
# FeedForward Neural Network used inside Transformer blocks
class FeedForward(nn.Module):
    """Position-wise feed-forward network used inside a transformer block.

    Expands each token vector from emb_dim to 4*emb_dim, applies GELU, and
    projects back to emb_dim, so input and output shapes match.

    Args:
        cfg: configuration mapping; only cfg["emb_dim"] is read.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        # Expansion: emb_dim -> 4*emb_dim.
        expand = nn.Linear(emb_dim, hidden_dim)
        # Non-linearity between the two linear maps.
        activation = GELU()
        # Contraction: 4*emb_dim -> emb_dim, restoring the embedding width.
        contract = nn.Linear(hidden_dim, emb_dim)

        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Apply expansion, GELU and contraction to the last dimension of `x`."""
        return self.layers(x)


In [13]:
#shape check: the feed-forward network keeps the input shape
ffn=FeedForward(GPT_CONFIG_124M)
x=torch.randn(2,3,768)#a batch of 2 inputs, each with 3 tokens, each token an embedding vector of size 768
out=ffn(x)
print(out.shape)

torch.Size([2, 3, 768])


In [None]:
#adding shortcut connections for forward pass

class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of Linear+GELU layers with optional shortcut (residual) connections.

    Args:
        layer_sizes: sequence of layer widths. Consecutive pairs
            (layer_sizes[i], layer_sizes[i+1]) define the input/output size of
            layer i, so a sequence of length k builds k-1 layers. The original
            six-entry list therefore still builds exactly five layers.
        use_shortcut: when True, a layer's input is added to its output
            whenever the two shapes match.

    Raises:
        ValueError: if fewer than two sizes are given (no layer can be built).
    """

    def __init__(self,layer_sizes,use_shortcut):
        super().__init__()
        if len(layer_sizes) < 2:
            raise ValueError("layer_sizes must contain at least an input and an output size")
        self.use_shortcut=use_shortcut
        # One Linear+GELU stage per consecutive pair of sizes, created in
        # order so seeded initialization matches the hand-written version.
        self.layers=nn.ModuleList([
            nn.Sequential(nn.Linear(n_in,n_out),GELU())
            for n_in,n_out in zip(layer_sizes[:-1],layer_sizes[1:])
        ])

    def forward(self,x):
        """Run `x` through every layer, adding shortcuts where enabled and possible."""
        for layer in self.layers:
            # Output of the current layer.
            layer_output=layer(x)

            # A shortcut is only valid when input and output shapes agree
            # (e.g. it is skipped for a layer that changes the width).
            if self.use_shortcut and x.shape==layer_output.shape:
                x=x+layer_output
            else:
                x=layer_output
        return x


In [20]:
layer_sizes=[3,3,3,3,3,1]#layer widths: consecutive entries are the input and output size of each layer
sample_input=torch.tensor([1.,0.,-1.])#1-D input with 3 features (no batch dimension), so the model output is 1-D as well
torch.manual_seed(123)#fix the random weight initialization
model_without_shortcut=ExampleDeepNeuralNetwork(
    layer_sizes,use_shortcut=False
)

In [None]:
def print_gradients(model,x):
    """Print the mean absolute gradient of every weight tensor in `model`.

    Runs one forward pass on `x`, computes the MSE loss against an all-zero
    target, back-propagates, and prints one line per parameter whose name
    contains 'weight'.

    Args:
        model: an nn.Module to inspect.
        x: input tensor passed to `model`.
    """
    # Drop gradients left over from an earlier backward pass; otherwise
    # calling this function twice on the same model would report accumulated
    # (summed) gradients.
    model.zero_grad()

    #Forward pass
    output=model(x)
    # Build the zero target with the same shape as the output. A hard-coded
    # (1,1) target triggers MSELoss's size-mismatch broadcasting warning when
    # the output is 1-D.
    target=torch.zeros_like(output)

    #calculate loss based on how close the target and output are
    loss_fn=nn.MSELoss()
    loss=loss_fn(output,target)
    #backward pass
    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            # print the mean absolute gradient of this weight tensor
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [21]:
#print the mean absolute weight gradient of each layer for the network without shortcut connections
print_gradients(model_without_shortcut,sample_input)

layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.0001201116101583466
layers.2.0.weight has gradient mean of 0.0007152041653171182
layers.3.0.weight has gradient mean of 0.001398873864673078
layers.4.0.weight has gradient mean of 0.005049646366387606


  return F.mse_loss(input, target, reduction=self.reduction)


In [22]:
#same seed as for the model without shortcuts, so the two models are built from the same random state and differ only in use_shortcut
torch.manual_seed(123)
model_with_shortcut=ExampleDeepNeuralNetwork(
    layer_sizes,use_shortcut=True
)
#print the mean absolute weight gradient of each layer for the network with shortcut connections
print_gradients(model_with_shortcut,sample_input)

layers.0.0.weight has gradient mean of 0.22169792652130127
layers.1.0.weight has gradient mean of 0.20694106817245483
layers.2.0.weight has gradient mean of 0.32896995544433594
layers.3.0.weight has gradient mean of 0.2665732502937317
layers.4.0.weight has gradient mean of 1.3258541822433472


  return F.mse_loss(input, target, reduction=self.reduction)


In [24]:
#Complete Transformer Block

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Args:
        emb_dim: size of the last dimension of the inputs; also the length of
            the learnable `scale` and `shift` vectors.
    """

    def __init__(self,emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps=1e-5
        # Learnable per-feature gain (starts at 1) and offset (starts at 0).
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))

    def forward(self,x):
        """Normalize each vector along the last dimension, then scale and shift.

        Args:
            x: tensor whose last dimension has size `emb_dim`.

        Returns:
            Tensor with the same shape as `x`.
        """
        mean=x.mean(dim=-1,keepdim=True)
        # unbiased=False divides by n (population variance). Without it,
        # torch applies Bessel's correction (n-1), so the normalized output
        # would not have unit variance under the layer-norm definition.
        var=x.var(dim=-1,keepdim=True,unbiased=False)
        norm_x=(x-mean)/torch.sqrt(var+self.eps)
        return self.scale*norm_x +self.shift
    
#GELU Activation class
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of `x`."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * x ** 3
        gate = 1 + torch.tanh(sqrt_two_over_pi * cubic_term)
        return 0.5 * x * gate

# FeedForward Neural Network used inside Transformer blocks
class FeedForward(nn.Module):
    """Position-wise feed-forward network used inside a transformer block.

    Expands each token vector from emb_dim to 4*emb_dim, applies GELU, and
    projects back to emb_dim, so input and output shapes match.

    Args:
        cfg: configuration mapping; only cfg["emb_dim"] is read.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        # Expansion: emb_dim -> 4*emb_dim.
        expand = nn.Linear(emb_dim, hidden_dim)
        # Non-linearity between the two linear maps.
        activation = GELU()
        # Contraction: 4*emb_dim -> emb_dim, restoring the embedding width.
        contract = nn.Linear(hidden_dim, emb_dim)

        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Apply expansion, GELU and contraction to the last dimension of `x`."""
        return self.layers(x)


In [25]:
#Multi-head attention whose forward method computes all heads in parallel (instead of looping over one head at a time).
#Rather than keeping separate query/key/value weight matrices per head, one large matrix is used per projection. Example: with 2 heads that each have their own weight matrix, 3 tokens produce two separate (3x2) outputs, which are concatenated along the columns into a (3x4) matrix.
#Here a single weight matrix with d_out=4 output columns produces the (3x4) query, key and value matrices directly, and each of them is then split across num_heads(2) into two (3x2) matrices.
import torch.nn as nn
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with one fused projection per Q/K/V.

    Each of the query/key/value projections maps d_in -> d_out once; the
    result is then split into `num_heads` heads of size d_out // num_heads.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output width across all heads; must be divisible by
            `num_heads`.
        context_length: side length of the pre-built causal mask (the longest
            sequence the mask covers).
        num_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
        qkv_bias: whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width of a single head

        # Creation order (key, query, value, out_proj) is kept as-is because
        # it determines the weights obtained under a fixed random seed.
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions that a
        # token must not attend to. Stored as a buffer, not a parameter.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal_mask)

    def _split_heads(self, projected, b, num_tokens):
        """Reshape (b, num_tokens, d_out) to (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Compute causal self-attention.

        Args:
            x: 3-D tensor of shape (b, num_tokens, d_in).

        Returns:
            Tensor of shape (b, num_tokens, d_out).
        """
        b, num_tokens, _ = x.shape

        # Project once, then split the last dimension across the heads.
        keys = self._split_heads(self.W_key(x), b, num_tokens)
        queries = self._split_heads(self.W_query(x), b, num_tokens)
        values = self._split_heads(self.W_value(x), b, num_tokens)

        # Row i of each head holds the scores of token i against all tokens.
        attn_scores = queries @ keys.transpose(2, 3)

        # Crop the mask to the current sequence length and hide the future.
        future_positions = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores = attn_scores.masked_fill(future_positions, -torch.inf)

        # Scale by sqrt(head_dim), the size of the keys' last dimension.
        scaling = self.head_dim ** 0.5
        attn_weights = torch.softmax(attn_scores / scaling, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Merge the heads back: d_out = num_heads * head_dim. contiguous() is
        # needed because view() cannot reshape the transposed tensor in place.
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)

        # Final linear projection over the merged heads.
        return self.out_proj(context_vec)






In [26]:
#Transformer Block

class TransformerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers with shortcuts.

    Each sub-layer is applied as
    `dropout(sublayer(layer_norm(x))) + x`, i.e. normalization happens
    before the sub-layer and the untouched input is added back afterwards.
    Input and output shapes are identical.

    Args:
        cfg: configuration mapping providing "emb_dim", "context_length",
            "n_heads", "drop_rate" and "qkv_bias".
    """

    def __init__(self, cfg):
        super().__init__()
        # Multi-head attention turns the embedding vectors into context vectors.
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        # Position-wise feed-forward network.
        self.ff = FeedForward(cfg)
        # One layer norm in front of each sub-layer.
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        # A single dropout module shared by both shortcut paths.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to `x`.

        Args:
            x: tensor of shape (batch_size, num_tokens, emb_dim).

        Returns:
            Tensor with the same shape as `x`.
        """
        # ---- Part 1: layer norm 1 -> attention -> dropout -> add input ----
        residual = x
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + residual

        # ---- Part 2: layer norm 2 -> feed-forward -> dropout -> add input ----
        residual = x
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        x = transformed + residual

        return x


In [None]:
#example: run a random batch through one transformer block and compare shapes
torch.manual_seed(123)
x=torch.rand(2,4,768)#2 inputs, 4 tokens each, embedding size 768
block=TransformerBlock(GPT_CONFIG_124M)
output=block(x)
print("Input shape:",x.shape)
print("Output shape:",output.shape)
#input and output have the same shape
#remember: the transformer block only applies layer normalization, multi-head attention, dropout, shortcut connections and the feed-forward network, all of which keep the output dimensions equal to the input dimensions
#the output vectors, however, now mix in information from the other tokens each token attends to

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])
