In [1]:
import torch 
import torch.nn as nn

In [2]:
#example
#config for gpt-2 small model

GPT_CONFIG_124M={
    "vocab_size":50257,
    "context_length":1024,
    "emb_dim":768,
    "n_heads":12,
    "n_layers":12,
    "drop_rate":0.1,
    "qkv_bias":False
}

In [3]:
#dummy gpt model class

#step 1:use a placeholder for transformer block

#step 2:use a placeholder for LayerNorm
import torch 
import torch.nn as nn
class DummyGPTModel(nn.Module):

    def __init__(self,cfg):#cgf-configuration of gpt-2 model
        super().__init__()
        self.tok_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
        self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"])
        self.drop_emb=nn.Dropout(cfg["drop_rate"])

        #use a placeholder for transformer block
        self.trf_blocks=nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        #use a placeholder for LayerNorm
        self.final_norm=DummyLayerNorm(cfg["emb_dim"])
        self.out_head=nn.Linear(
            cfg["emb_dim"],cfg["vocab_size"],bias=False
        )

    def forward(self,in_idx):
        batch_size,seq_len=in_idx.shape#batch_size-no of inputs,seq_len=length of no of tokens of each input in batch
        tok_embeds=self.tok_emb(in_idx)#token embeddings for input token ids each token id will have 768 dimensional token embedding
        pos_embeds=self.pos_emb(torch.arange(seq_len,device=in_idx.device))#here arange is used for each input sequence there will be n number of tokens ,it is creating positional embedding vectors for n number of tokens.positional embedding created for one input sequence is used for all the other input sequences
        x=tok_embeds+pos_embeds
        #input token embeddings
        x=self.drop_emb(x)
        #dropout layer
        x=self.trf_blocks(x)
        #transformer block-implementing layernorm,multi-head attention,dropout layers
        x=self.final_norm(x)
        #final norm layer: shape until this step-(num_of_tokens_in_input_seq x number-of-embedding-dimension)
        logits=self.out_head(x)
        return logits#logitsdimension-(no_of_tokens_input_seq x vocab_size ) each row represents probability for each of the 50527 words to occur in that place

class DummyTransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()

    
    def forward(self,x):
        return x


class DummyLayerNorm(nn.Module):
    def __init__(self,cfg):
        super().__init__()

    
    def forward(self,x):
        return x

In [4]:
#Complete Transformer Block

class LayerNorm(nn.Module):

    def __init__(self,emb_dim):
        super().__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))

    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        var=x.var(dim=-1,keepdim=True)
        norm_x=(x-mean)/torch.sqrt(var+self.eps)
        return self.scale*norm_x +self.shift
    
#GELU Activation class
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self,x):
        return 0.5 * x *(1+torch.tanh(
            torch.sqrt(torch.tensor(2.0/torch.pi))*
            (x+0.044715*torch.pow(x,3))
        ) 

        )

# FeedForward Neural Network used inside Transformer blocks
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        emb = cfg["emb_dim"]
        hidden = 4 * emb   # Expanded dimension

        self.layers = nn.Sequential(
            # -------- EXPANSION --------
            # Increase embedding size from emb → 4*emb
            # Gives the model more capacity and richer feature space
            nn.Linear(emb, hidden),

            # -------- ACTIVATION --------
            # GELU adds non-linearity and helps the network learn complex patterns
            GELU(),

            # -------- CONTRACTION --------
            # Bring dimension back from 4*emb → emb
            # Keeps output compatible with the transformer's embedding size
            nn.Linear(hidden, emb)
        )

    def forward(self, x):
        return self.layers(x)


In [5]:
#improving multi head attention forward method by processing multiple heads sequentially
#here instaed of making multiple weight matrices for key,value and query we take one large matrix (for eg: if there are 2 heads with 2 separate weight matrix ,the output will be 2 different matrixes with dimension of (3x2) then add it along columns will result in (3x4) matrix 
# but here we are taking one large weight matrix with dimension of (3x4) and find query,key,value matrices and split them with num_heads(2) ,result in two 3x2 matrices)
import torch.nn as nn
class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,context_length,num_heads,dropout,qkv_bias=False):
        super().__init__()
        assert (d_out%num_heads==0),"d_out must be divisible by num_heads"

        self.d_out=d_out
        self.num_heads=num_heads
        self.head_dim = d_out//num_heads #finding dimension of each head
        self.W_key=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_query=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_value=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.out_proj=nn.Linear(d_out,d_out)
        self.dropout=nn.Dropout(dropout)
        self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length),diagonal=1))

    def forward(self,x):
        b,num_tokens,d_in=x.shape

        #shape for keys,queries,values matrix=(b,num_tokens,d_out)
        keys=self.W_key(x)
        queries=self.W_query(x)
        values=self.W_value(x)

        #split the larger matrices(keys,queries,values) according to num of heads
        #unroll last dimension to split the matrices according to no of heads:(b,num_tokens,d_out)->(b,num_tokens,num_heads,head_dim)
        keys=keys.view(b,num_tokens,self.num_heads,self.head_dim)
        queries=queries.view(b,num_tokens,self.num_heads,self.head_dim) 
        values=values.view(b,num_tokens,self.num_heads,self.head_dim) 

        #grouping according to num of heads by transposing
        #(b,num_tokens,num_heads,head_dim)->(b,num_heads,num_tokens,head_dim)
        keys=keys.transpose(1,2)
        queries=queries.transpose(1,2)
        values=values.transpose(1,2)

        #computing attention scores
        attn_scores=queries@keys.transpose(2,3)
        #here, each row i in each head represents the attention score of ith token with respect to all tokens in that head

        #implementing mask for upper diagonal
        mask_bool=self.mask.bool()[:num_tokens,:num_tokens]

        #masking the attention scores
        attn_scores.masked_fill_(mask_bool,-torch.inf)

        attn_weights=torch.softmax(attn_scores/keys.shape[-1]**0.5,dim=-1)#keys.shape[1] refers to head_dim
        attn_weights=self.dropout(attn_weights)

        #context vector: shape->(b,num_tokens,num_heads,head_dim)
        context_vec=(attn_weights@values).transpose(1,2)

        #combine heads,where d_out=num_heads*head_dim
        context_vec=context_vec.contiguous().view(b,num_tokens,self.d_out)#values are stored sometimes non-contiguously in memory.if dimesnion changed on non-contiguously,it will lead to error.This method(contiguous) makes a new tensor with the same values but stored in a clean, contiguous memory block.
        context_vec=self.out_proj(context_vec)#optional projection layer

        return context_vec






In [6]:
#Transformer Block

class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        #Multihead attention instance for converting embedding vectors into context vectors
        self.att=MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"]
        )
        #FeedForward Neural Network instance
        self.ff=FeedForward(cfg)
        #LayerNormalization instance 1 
        self.norm1=LayerNorm(cfg["emb_dim"])
        #LayerNormalization instance 2
        self.norm2=LayerNorm(cfg["emb_dim"])
        #Dropout layer
        self.drop_shortcut=nn.Dropout(cfg["drop_rate"])

    def forward(self,x):
        #input x is preserved for adding after output from first part of transformer
        shortcut=x
        #################Part 1 of transformer################
        #First input passing through Layer normalization layer 1
        x=self.norm1(x)
        #output from LayerNorm layer 1 passing through multi head attention
        x=self.att(x) # shape[batch_size,num_tokens,emb_size]
        #dropout layer
        x=self.drop_shortcut(x)
        #output is added with input(initial/original input)
        x=x+shortcut

        #################Part 2 of transformer################
        #input x(output from 1st part of transformer) is preserved for adding after output from 2nd part of transformer
        shortcut=x
        # input from 1st part of transformer passing through Layer normalization layer 1
        x=self.norm2(x)
        #ouput from LayerNorm 2nd layer is passed through feed forward neural network
        x=self.ff(x)
        #output from FeedForward NN is passed through dropout layer
        x=self.drop_shortcut(x)
        #ouput from above dropout layer is added with shortcut input(output of 1st part of transformer)
        x=x+shortcut

        return x


In [7]:


torch.manual_seed(123)
x=torch.rand(2,4,768)
block=TransformerBlock(GPT_CONFIG_124M)
output=block(x)
print("Input shape:",x.shape)
print("Output shape:",output.shape)
#same shape for input and output
#remember transformer block only doing operstions with layer normaliation,multi head attention ,dropout ,shortcut connections,feedforward neural network while keeping the dimension of input same as through the output
#but the output vectors contains a rich information about each token and  how its related with other words

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [8]:
#Complete GPT model class - 124M parameters

import torch 
import torch.nn as nn
class GPTModel(nn.Module):

    def __init__(self,cfg):#cgf-configuration of gpt-2 model
        super().__init__()
        self.tok_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
        self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"])
        self.drop_emb=nn.Dropout(cfg["drop_rate"])

        #use a placeholder for transformer block
        self.trf_blocks=nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        #use a placeholder for LayerNorm
        self.final_norm=LayerNorm(cfg["emb_dim"])
        self.out_head=nn.Linear(
            cfg["emb_dim"],cfg["vocab_size"],bias=False
        )

    def forward(self,in_idx):
        batch_size,seq_len=in_idx.shape#batch_size-no of inputs,seq_len=length of no of tokens of each input in batch
        tok_embeds=self.tok_emb(in_idx)#token embeddings for input token ids each token id will have 768 dimensional token embedding
        pos_embeds=self.pos_emb(torch.arange(seq_len,device=in_idx.device))#here arange is used for each input sequence there will be n number of tokens ,it is creating positional embedding vectors for n number of tokens.positional embedding created for one input sequence is used for all the other input sequences
        x=tok_embeds+pos_embeds
        #input token embeddings
        x=self.drop_emb(x)
        #dropout layer
        x=self.trf_blocks(x)
        #transformer block-implementing layernorm,multi-head attention,dropout layers
        x=self.final_norm(x)
        #final norm layer: shape until this step-(num_of_tokens_in_input_seq x number-of-embedding-dimension)
        logits=self.out_head(x)
        return logits#logitsdimension-(no_of_tokens_input_seq x vocab_size ) each row represents probability for each of the 50527 words to occur in that place as next token to current token


In [9]:
torch.manual_seed(123)
model=GPTModel(GPT_CONFIG_124M)
batch=torch.tensor([[6109,3626,6100,345],
                    [6109,1110,6622,257]])
#2 input sequence with 4 token ids each
out=model(batch)
print("Input batch:\n",batch)
print("Output shape:",out.shape)
#here output shape is 3d vector for 2 inputs with each having 4 tokens and each token having 50257 dimensions(it contains probability of each of 50257 words to oocur as next word) 
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3612,  0.4223, -0.0709,  ...,  0.3479,  0.4655, -0.2833],
         [-0.1520, -0.5036, -0.8496,  ...,  0.0875,  0.5710, -0.3421],
         [ 0.7495,  0.0505,  0.0259,  ...,  0.0527, -0.4995, -0.1761],
         [-0.9089,  0.4484, -0.1124,  ...,  0.7924,  0.4422, -0.0015]],

        [[-0.2562,  0.0899,  0.0338,  ...,  0.2659,  0.4448, -0.6800],
         [ 0.1318,  0.4321, -0.1967,  ...,  0.8461,  0.2094,  0.1714],
         [ 1.0333,  1.0043, -0.2194,  ...,  0.6317,  0.3790, -0.2896],
         [-0.1300,  0.3927,  0.3873,  ...,  1.2650, -0.1869, -0.0020]]],
       grad_fn=<UnsafeViewBackward0>)


In [10]:
#calculating total no of parameters
total_params=sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")
#here parameters is 163M which is greater than 124M because in initial GPT-2 Architecture , parameters used in token embeddings are reused in final output layer but here parameters are not reused that why 163M parameters
#proof is below

Total number of parameters: 163,009,536


In [11]:
print("Token embedding layer shape:",model.tok_emb.weight.shape)
print("Output layer shape:",model.out_head.weight.shape)

Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])


In [12]:
#so remove the parameters count  reused in final output layer
total_params_gpt2=total_params-sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2}")
#now it is 124M parameters
#but using separete weights for token embedding layer and output layer is good for prediction . it is used in LLMs

Number of trainable parameters considering weight tying: 124412160


In [13]:
#space taken by the model
total_size_bytes=total_params*4
total_size_mb=total_size_bytes/(1024*1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


In [15]:
#generating text from output token

def generate_text_simple(model,idx,max_new_tokens,context_size):
    #idx is inputs(batch size,num_of tokens)
    #model- GPT-2 Model
    #max_new_tokens-no of tokens to be predicted

    for _ in range(max_new_tokens):
        #eg: if llm supports only 5 tokens, and the given context_size is 10
        #then only last 5 tokens are used as context
        idx_cond=idx[:,-context_size:]
        #getting token ids in an input sequence only until context size limit

        with torch.no_grad():
            logits=model(idx_cond)##shape-(no_of_batches,no_of_tokens_in_each_input,vocab_size)

        logits=logits[:,-1,:]#getting last row from each inputs of a btach

        #applying softmax for finding probabilties. dim=-1 because values are summed up along column for each row 
        probabs=torch.softmax(logits,dim=-1)

        #finding highest probabilty value index for each row 
        idx_next=torch.argmax(probabs,dim=-1,keepdim=True)


        idx = torch.cat((idx,idx_next),dim=1)#(batch,n_tokens+1)

    return idx


In [17]:
import tiktoken
tokenizer=tiktoken.get_encoding("gpt2")

In [None]:
start_context="Hello, I am"
encoded=tokenizer.encode(start_context)
print("encoded: ",encoded)
encoded_tensor=torch.tensor(encoded).unsqueeze(0)
#shape changed in shape of (batch,num_of_tokens)
print("encoded tensor shape: ", encoded_tensor.shape)

encoded:  [15496, 11, 314, 716]
encoded tensor shape:  torch.Size([1, 4])


In [None]:
model.eval()
out=generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output:",out)
#here inputs are 4 tokens but output is 10 tokens
#started withed 4 tokens,it add 5th token and give 5 tokens as input and find 6th token like this iteratively do until max_new_tokens limit 
print("Output length :",len(out[0]))

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 29739,   554]])
Output length : 10


In [None]:
decoded_text=tokenizer.decode(out.squeeze(0).tolist())#squeeze remove the 1st dimesnion here out-(1,10)-it removed 1st dimension(batch size dimension) and converted it into single list
print(decoded_text)
#the output is random because the model is not trained all the 124M parameters should be trained

Hello, I am Featureiman Byeswick Exit In
