In [56]:
import torch
from torch import nn
import torch.nn.functional as F
import math

# Configuration class for GPT model hyperparameters

In [57]:
class GPTConfig:
    """Container for GPT hyperparameters.

    Dropout rates are class-level defaults. ``vocab_size`` and ``max_len`` are
    required; every additional keyword argument is stored as an instance
    attribute of the same name, shadowing any class-level default.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        max_len: Stored as ``self.max_len``.
        **kwargs: Arbitrary extra hyperparameters, stored verbatim.
    """

    # Default dropout probabilities (overridable per instance via kwargs).
    attn_dropout = 0.1
    embed_dropout = 0.1
    ff_dropout = 0.1

    def __init__(self, vocab_size, max_len, **kwargs):
        self.vocab_size, self.max_len = vocab_size, max_len
        # Attach any extra hyperparameters, in the order they were passed.
        for name in kwargs:
            setattr(self, name, kwargs[name])

# GPT1Config extends GPTConfig with specific hyperparameters
class GPT1Config(GPTConfig):
    """GPTConfig preset with 768-dim embeddings, 12 blocks and 12 heads."""

    embed_dim = 768
    num_blocks = 12
    num_heads = 12

In [58]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    The table is precomputed once for ``max_len`` positions and registered as a
    non-persistent buffer named ``encoding``. Being a buffer, it follows the
    module through ``.to(device)`` / ``.cuda()`` / ``.half()``; being
    non-persistent, it adds no key to ``state_dict()``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs. Odd
            values are supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the frequency vector to match.
        encoding[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Shape (1, max_len, d_model) so it broadcasts over the batch dimension.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Returns:
            Tensor of the same shape as ``x`` (after broadcasting).
        """
        return x + self.encoding[:, :x.size(1)]


#Main GPT model class

In [59]:
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Pipeline: token embedding -> positional encoding -> dropout ->
    ``num_blocks`` transformer blocks -> LayerNorm -> linear projection to
    vocabulary logits.

    Args:
        config: Object exposing ``vocab_size``, ``max_len``, ``embed_dim``,
            ``embed_dropout`` and ``num_blocks`` (plus whatever ``Block``
            reads from it).
    """

    def __init__(self, config):
        super().__init__()
        # Define layers of the GPT model using ModuleDict
        self.layers = nn.ModuleDict({
            'tok_embed': nn.Embedding(config.vocab_size, config.embed_dim),
            'pos_embed': PositionalEncoding(config.embed_dim, config.max_len),
            'dropout': nn.Dropout(config.embed_dropout),
            'blocks': nn.Sequential(*[Block(config) for _ in range(config.num_blocks)]),
            'ln': nn.LayerNorm(config.embed_dim),
            'fc': nn.Linear(config.embed_dim, config.vocab_size)
        })
        # Initialize weights using Xavier uniform initialization
        self.apply(self.weights_init)

    def forward(self, x):
        """Map token ids to next-token logits.

        Args:
            x: Integer tensor of token ids; dimension 1 is the sequence axis.

        Returns:
            Tensor of logits with a trailing dimension of ``vocab_size``.
        """
        tok_embedding = self.layers['tok_embed'](x)
        # The positional module already returns (input + encoding); adding
        # tok_embedding again here would double-count the token embedding.
        x = self.layers['pos_embed'](tok_embedding)
        x = self.layers['dropout'](x)
        x = self.layers['blocks'](x)
        x = self.layers['ln'](x)
        x = self.layers['fc'](x)
        return x

    @staticmethod
    def weights_init(module):
        """Xavier-uniform init for Linear/Embedding weights; zero any bias.

        Intended for use with ``nn.Module.apply``; modules of other types are
        left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.xavier_uniform_(module.weight.data)
            # nn.Embedding has no bias attribute; nn.Linear may have bias=None.
            if hasattr(module, 'bias') and module.bias is not None:
                nn.init.constant_(module.bias.data, 0)



# Transformer block class

In [60]:
class Block(nn.Module):
    """One transformer block: attention and feed-forward sublayers.

    Each sublayer is preceded by its own LayerNorm and wrapped in a residual
    connection. The feed-forward sublayer expands to four times the embedding
    width, applies GELU, projects back and applies dropout.

    Args:
        config: Object exposing ``embed_dim`` and ``ff_dropout`` (plus whatever
            ``MultiheadAttention`` reads from it).
    """

    def __init__(self, config):
        super().__init__()
        width = config.embed_dim
        hidden = width * 4

        # Build sublayers first, then group them under a single ModuleDict.
        norm_before_attn = nn.LayerNorm(width)
        norm_before_ff = nn.LayerNorm(width)
        attention = MultiheadAttention(config)
        feed_forward = nn.Sequential(
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(config.ff_dropout),
        )

        self.layers = nn.ModuleDict({
            'ln1': norm_before_attn,
            'ln2': norm_before_ff,
            'attn': attention,
            'ff': feed_forward,
        })

    def forward(self, x):
        """Apply the attention sublayer, then the feed-forward sublayer."""
        parts = self.layers

        # Residual around (LayerNorm -> attention).
        attn_out = parts['attn'](parts['ln1'](x))
        x = x + attn_out

        # Residual around (LayerNorm -> feed-forward).
        ff_out = parts['ff'](parts['ln2'](x))
        return x + ff_out


# Multihead Attention class

In [61]:
class MultiheadAttention(nn.Module):
    """Causal multi-head self-attention.

    Each position attends to itself and to earlier positions only. Attention
    dropout is applied to the softmax probabilities; a second dropout is
    applied after the output projection.

    Args:
        config: Object exposing ``embed_dim``, ``num_heads``, ``max_len``,
            ``attn_dropout`` and ``ff_dropout``. ``embed_dim`` must be
            divisible by ``num_heads``.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config.embed_dim
        self.num_heads = config.num_heads
        assert embed_dim % self.num_heads == 0, "invalid heads and embedding dimension configuration"
        # Define layers of the multihead attention mechanism using ModuleDict
        self.layers = nn.ModuleDict({
            'key': nn.Linear(embed_dim, embed_dim),
            'value': nn.Linear(embed_dim, embed_dim),
            'query': nn.Linear(embed_dim, embed_dim),
            'proj': nn.Linear(embed_dim, embed_dim),
            'attn_dropout': nn.Dropout(config.attn_dropout),
            'proj_dropout': nn.Dropout(config.ff_dropout),
        })

        # Lower-triangular mask INCLUDING the diagonal: True where attention is
        # allowed. Excluding the diagonal would leave the first position with
        # no valid key, and softmax over an all -inf row yields NaN.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config.max_len, config.max_len)).unsqueeze(0).unsqueeze(0).bool(),
        )

    def forward(self, x):
        """Compute causal self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        batch_size = x.size(0)
        seq_len = x.size(1)

        # Split the projections into heads.
        # k_t: (batch, heads, head_dim, seq); q, v: (batch, heads, seq, head_dim)
        k_t = self.layers['key'](x).reshape(batch_size, seq_len, self.num_heads, -1).permute(0, 2, 3, 1)
        v = self.layers['value'](x).reshape(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
        q = self.layers['query'](x).reshape(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)

        # Scaled dot-product scores: (batch, heads, seq, seq)
        attn = torch.matmul(q, k_t) / math.sqrt(q.size(-1))
        mask = self.mask[:, :, :seq_len, :seq_len]
        attn = attn.masked_fill(~mask, float("-inf"))

        # Normalise first, then drop probabilities. Dropout on the raw logits
        # would rescale them and can turn masked -inf entries into NaN.
        attn = F.softmax(attn, dim=-1)
        attn = self.layers['attn_dropout'](attn)

        # Weighted sum of values, then merge heads back into embed_dim.
        y = torch.matmul(attn, v)
        y = y.transpose(1, 2)
        y = y.reshape(batch_size, seq_len, -1)
        y = self.layers['proj_dropout'](self.layers['proj'](y))
        return y


In [62]:
# Example usage: GPT-1 sized model (about 100 million parameters with a 10,000-token vocabulary)
# Build the configuration and the model.
gpt_config = GPT1Config(
    vocab_size=10000,
    max_len=512,
    embed_dim=768,
    num_heads=12,
    num_blocks=12,
)
gpt_model = GPT(gpt_config)

# One batch of random token ids spanning the full context length.
input_sequence = torch.randint(0, gpt_config.vocab_size, (1, gpt_config.max_len))
output = gpt_model(input_sequence)

# Report the logits shape and the total parameter count.
print(
    f"Output shape: {output.shape}, "
    f"Model Parameters: {sum(p.numel() for p in gpt_model.parameters()):,} parameters"
)


Output shape: torch.Size([1, 512, 10000]), Model Parameters: 100,426,000 parameters
