In [1]:
import sys 
import os 
# Make the parent directory importable (presumably where the `GPT` package
# imported further down lives — confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicates.
_project_root = os.path.abspath("..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [6]:
import json
# Load the model hyperparameters from GPT_config_124M.json.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) because
# open() otherwise falls back to the platform's locale encoding.
with open("../GPT_Model_Configuration/GPT_config_124M.json", "r", encoding="utf-8") as f:
    GPT_CONFIG_124M = json.load(f)

In [7]:
from GPT.GPT_Model import TransformerBlock
import torch 
import torch.nn as nn

In [10]:
# Smoke-test a single TransformerBlock: feed it a random batch and check that
# it hands back a tensor of exactly the same shape.
torch.manual_seed(123)  # fixed seed so the random input below is reproducible
# NOTE(review): presumably (batch, tokens, emb_dim); the trailing 768 has to
# agree with the embedding width in GPT_CONFIG_124M — confirm against the JSON.
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)
# Enforce the invariant instead of relying on eyeballing the printed sizes.
assert output.shape == x.shape, "TransformerBlock must preserve the input shape"

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [11]:
from torchinfo import summary
# Show the per-layer parameter counts of the single transformer block.
# Kept as the last expression of the cell so the notebook displays the result.
summary(block)

Layer (type:depth-idx)                   Param #
TransformerBlock                         --
├─MultiHeadAttention: 1-1                --
│    └─Linear: 2-1                       590,592
│    └─Linear: 2-2                       590,592
│    └─Linear: 2-3                       590,592
│    └─Linear: 2-4                       590,592
│    └─Dropout: 2-5                      --
├─FeedForward: 1-2                       --
│    └─Sequential: 2-6                   --
│    │    └─Linear: 3-1                  2,362,368
│    │    └─GELU: 3-2                    --
│    │    └─Linear: 3-3                  2,360,064
├─LayerNorm: 1-3                         1,536
├─LayerNorm: 1-4                         1,536
├─Dropout: 1-5                           --
Total params: 7,087,872
Trainable params: 7,087,872
Non-trainable params: 0

In [12]:
import tiktoken 
# GPT-2 tokenizer used to turn the sample prompts into token ids.
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
# Encode each prompt and stack the id tensors along a new leading batch axis.
# torch.stack needs equal-length rows; both prompts encode to 4 ids here.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(prompt)) for prompt in (txt1, txt2)],
    dim=0,
)

In [13]:
# Inspect the token ids of the two encoded prompts (one row per prompt).
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [14]:
# Confirm the batch layout: 2 prompts with 4 token ids each.
print(batch.shape)

torch.Size([2, 4])


In [15]:
from GPT.GPT_Model import GPTModel 

In [16]:
# Hyperparameters for the larger model; this dict is what gets passed to
# GPTModel further down.
# NOTE(review): the name suggests a 1542M-parameter variant, but the notebook
# reports 1,416,404,992 parameters for this config (1,313,478,656 without the
# output head) — confirm emb_dim / n_heads / n_layers are the intended values.
GPT_CONFIG_1542M = dict(
    vocab_size=50257,      # size of the token vocabulary
    context_length=1024,   # number of positions covered by the positional embedding
    emb_dim=2048,          # embedding width
    n_heads=32,            # attention heads per block
    n_layers=24,           # number of transformer blocks
    drop_rate=0.1,         # dropout probability
    qkv_bias=False,        # no bias on the query/key/value projections
)


In [17]:
# Build the full GPT model from the GPT_CONFIG_1542M hyperparameters.
model = GPTModel(GPT_CONFIG_1542M)

In [18]:
# Forward pass used purely as a shape check: the result is expected to hold
# one vocab_size-wide vector per input token, i.e. (batch, tokens, vocab_size).
# Run under no_grad because nothing here calls backward(), so recording the
# autograd graph for the whole model would be wasted work and memory.
# NOTE(review): no model.eval() call is visible, so dropout is presumably still
# active; that does not affect the shapes inspected here.
with torch.no_grad():
    out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])


In [19]:
# Print the layer-by-layer parameter summary for the full model.
print(summary(model))

Layer (type:depth-idx)                        Param #
GPTModel                                      --
├─Embedding: 1-1                              102,926,336
├─Embedding: 1-2                              2,097,152
├─Dropout: 1-3                                --
├─Sequential: 1-4                             --
│    └─TransformerBlock: 2-1                  --
│    │    └─MultiHeadAttention: 3-1           16,779,264
│    │    └─FeedForward: 3-2                  33,564,672
│    │    └─LayerNorm: 3-3                    4,096
│    │    └─LayerNorm: 3-4                    4,096
│    │    └─Dropout: 3-5                      --
│    └─TransformerBlock: 2-2                  --
│    │    └─MultiHeadAttention: 3-6           16,779,264
│    │    └─FeedForward: 3-7                  33,564,672
│    │    └─LayerNorm: 3-8                    4,096
│    │    └─LayerNorm: 3-9                    4,096
│    │    └─Dropout: 3-10                     --
│    └─TransformerBlock: 2-3                  --
│   

In [20]:
# Add up the element counts of every parameter tensor registered on the model.
total_params = sum(
    weight.numel() for weight in model.parameters()
)
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 1,416,404,992


In [21]:
# Compare the token-embedding matrix with the output-head matrix; matching
# shapes are what would allow the two weights to be tied.
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

Token embedding layer shape: torch.Size([50257, 2048])
Output layer shape: torch.Size([50257, 2048])


In [22]:
# Parameter count under weight tying: the output head's own parameters are
# subtracted from the grand total so they are not counted a second time.
total_params_gpt2 = total_params - sum(
    weight.numel() for weight in model.out_head.parameters()
)
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Number of trainable parameters considering weight tying: 1,313,478,656


In [23]:
# Estimate the memory taken by the model weights, in MB.
# Each parameter tensor reports its own bytes-per-element, so the figure stays
# correct if the model is ever cast to a different dtype, instead of silently
# assuming 4 bytes per value. Only parameters are counted; buffers are not.
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
total_size_mb = total_size_bytes / (1024 * 1024)  # bytes -> MB, with 1 MB taken as 1024 * 1024 bytes
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 5403.16 MB


In [24]:
# Estimate the memory taken by the model weights, in GB.
# Each parameter tensor reports its own bytes-per-element, so the figure stays
# correct if the model is ever cast to a different dtype, instead of silently
# assuming 4 bytes per value. Only parameters are counted; buffers are not.
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
total_size_mb = total_size_bytes / (1024 * 1024)  # bytes -> MB, with 1 MB taken as 1024 * 1024 bytes
total_size_gb = total_size_mb / 1024  # MB -> GB
print(f"Total size of the model: {total_size_gb:.4f} GB")

Total size of the model: 5.2765 GB


In [25]:
# Display the module tree of the model (sub-module names, types and sizes).
model

GPTModel(
  (tok_emb): Embedding(50257, 2048)
  (pos_emb): Embedding(1024, 2048)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=2048, out_features=2048, bias=False)
        (W_key): Linear(in_features=2048, out_features=2048, bias=False)
        (W_value): Linear(in_features=2048, out_features=2048, bias=False)
        (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=2048, out_features=8192, bias=True)
          (1): GELU()
          (2): Linear(in_features=8192, out_features=2048, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linea