In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import math 
import numpy as np 
import torch.optim as optim

In [2]:
class TokenEmbedding(nn.Module):
    """Learned token-embedding lookup whose output is scaled by sqrt(embed_dim).

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size,embed_dim):
        super(TokenEmbedding, self).__init__()
        self.vocab_size = vocab_size
        # Backward-compatible alias: this attribute was originally misspelled,
        # so keep the old name for any code that already reads it.
        self.voacab_size = vocab_size
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Replace nn.Embedding's default init with a narrow normal distribution.
        nn.init.normal_(self.embedding.weight,mean=0.0,std=0.02)

    def forward(self, x):
        """Look up the embeddings of the integer ids in ``x``.

        Returns:
            A tensor of shape ``x.shape + (embed_dim,)`` holding the looked-up
            vectors multiplied by ``sqrt(embed_dim)``.
        """
        return self.embedding(x) * math.sqrt(self.embed_dim)

In [3]:
class PositionalEmbedding(nn.Module):
    """Learned absolute positional embeddings, one vector per position index.

    Args:
        max_seq_len: Number of positions in the embedding table.
        embed_dim: Width of each positional vector.
    """

    def __init__(self, max_seq_len,embed_dim):
        super(PositionalEmbedding, self).__init__()
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim
        self.positional_embedding = nn.Embedding(max_seq_len, embed_dim)

        # Replace nn.Embedding's default init with a narrow normal distribution.
        nn.init.normal_(self.positional_embedding.weight,mean=0.0,std=0.02)

    def forward(self, x):
        """Return positional vectors for positions ``0 .. x.shape[1] - 1``.

        Only the first two dimensions of ``x`` (batch size and sequence length)
        and its device are used; the values of ``x`` are ignored.

        Returns:
            A tensor of shape ``(x.shape[0], x.shape[1], embed_dim)``.

        Raises:
            IndexError: If the sequence length exceeds ``max_seq_len``.
        """
        batch_size,seq_len = x.shape[0],x.shape[1]
        # Fail early with a readable message instead of an opaque out-of-range
        # error coming from inside the embedding lookup.
        if seq_len > self.max_seq_len:
            raise IndexError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
        return self.positional_embedding(positions)
        

In [4]:
class SinusoidalPositionalEmbedding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encodings.

    The table is computed once in the constructor and stored as the buffer
    ``pe`` of shape ``(max_seq_len, embed_dim)``: even columns hold sines and
    odd columns hold cosines of ``position * 10000 ** (-2i / embed_dim)``.

    Args:
        max_seq_len: Number of positions to precompute.
        embed_dim: Width of each positional vector (even or odd).
    """

    def __init__(self, max_seq_len,embed_dim):
        super(SinusoidalPositionalEmbedding, self).__init__()
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim

        pe = torch.zeros(max_seq_len, embed_dim)
        position = torch.arange(0,max_seq_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0,embed_dim,2).float()* - (math.log(10000.0)/embed_dim))
        angles = position * div_term
        pe[:,0::2] = torch.sin(angles)
        # With an odd embed_dim there is one fewer cosine column than sine
        # column, so trim the angles to the number of odd-indexed columns.
        pe[:,1::2] = torch.cos(angles[:, : embed_dim // 2])
        # A buffer follows the module across devices and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self,x):
        """Return the encodings for positions ``0 .. x.shape[1] - 1``.

        Only the first two dimensions of ``x`` are used; its values are ignored.

        Returns:
            An expanded view of shape ``(x.shape[0], x.shape[1], embed_dim)``.

        Raises:
            IndexError: If the sequence length exceeds ``max_seq_len``.
        """
        batch_size,seq_len =x.shape[0],x.shape[1]
        # Slicing past the end of the table would silently return fewer rows
        # than requested, so reject over-long sequences explicitly.
        if seq_len > self.max_seq_len:
            raise IndexError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )
        return self.pe[:seq_len].unsqueeze(0).expand(batch_size, -1, -1)
    

In [5]:
class GPTEmbedding(nn.Module):
    """Input embedding stage: token embeddings plus positional embeddings.

    The two embeddings are summed, optionally passed through LayerNorm, and
    finally through dropout.

    Args:
        vocab_size: Vocabulary size handed to ``TokenEmbedding``.
        embed_dim: Embedding width shared by every sub-layer.
        max_seq_len: Number of positions handed to the positional embedding.
        dropout: Probability passed to ``nn.Dropout``.
        use_sinusoidal: Selects ``SinusoidalPositionalEmbedding`` when true,
            otherwise ``PositionalEmbedding``.
    """

    def __init__(self, vocab_size, embed_dim, max_seq_len, dropout=0.1, use_sinusoidal=False):
        super().__init__()
        self.vocab_size, self.embed_dim, self.max_seq_len = vocab_size, embed_dim, max_seq_len

        self.token_embedding = TokenEmbedding(vocab_size, embed_dim)

        # Both positional variants share the (max_seq_len, embed_dim) constructor.
        positional_cls = SinusoidalPositionalEmbedding if use_sinusoidal else PositionalEmbedding
        self.pos_embedding = positional_cls(max_seq_len, embed_dim)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embed_dim)

    def forward(self, input_ids, use_layer_norm=True):
        """Embed ``input_ids`` and return the combined embedding tensor.

        Args:
            input_ids: Token ids passed to the token embedding.
            use_layer_norm: When true, apply LayerNorm to the summed embeddings
                before dropout.
        """
        tokens = self.token_embedding(input_ids)
        # The positional module only reads the batch/sequence dimensions of its
        # argument, so the token embeddings are passed in for their shape.
        combined = tokens + self.pos_embedding(tokens)
        if use_layer_norm:
            combined = self.layer_norm(combined)
        return self.dropout(combined)

In [9]:
class GPTEmbeddingConfig:
    """Plain container for the hyperparameters of the embedding stage.

    Attributes are stored exactly as given: ``vocab_size``, ``embed_dim``,
    ``max_seq_len``, ``dropout`` and ``use_sinusoidal``.
    """

    def __init__(
        self,
        vocab_size=50257,
        embed_dim=768,
        max_seq_len=1024,
        dropout=0.1,
        use_sinusoidal=False,
    ):
        # Size-related settings.
        self.vocab_size, self.embed_dim, self.max_seq_len = vocab_size, embed_dim, max_seq_len
        # Regularisation and positional-encoding choice.
        self.dropout, self.use_sinusoidal = dropout, use_sinusoidal

def create_gpt_embedding(config):
    """Build a ``GPTEmbedding`` from the attributes of ``config``.

    ``config`` must expose ``vocab_size``, ``embed_dim``, ``max_seq_len``,
    ``dropout`` and ``use_sinusoidal``; each is forwarded as the keyword
    argument of the same name.
    """
    field_names = ("vocab_size", "embed_dim", "max_seq_len", "dropout", "use_sinusoidal")
    constructor_kwargs = {field: getattr(config, field) for field in field_names}
    return GPTEmbedding(**constructor_kwargs)


In [10]:
# GPT-2 small settings: GPT-2 vocabulary size, 768-wide embeddings, a maximum
# sequence length of 1024, 10% dropout, and learned positional embeddings.
config = GPTEmbeddingConfig(
    vocab_size=50257, embed_dim=768, max_seq_len=1024,
    dropout=0.1, use_sinusoidal=False,
)

In [11]:
# Seed PyTorch's global RNG before anything random happens, so that the
# randomly initialised weights (and random tensors drawn in later cells) are
# identical on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Create embedding layer
embedding_layer = create_gpt_embedding(config)

In [12]:
# Dimensions of the toy batch used throughout the demo cells.
batch_size, seq_len = 2, 10

In [13]:
# Draw random token ids in [0, vocab_size) for a (batch_size, seq_len) batch.
input_ids = torch.randint(
    low=0,
    high=config.vocab_size,
    size=(batch_size, seq_len),
)

In [14]:
# Report the shape and the raw contents of the sampled ids.
print(
    f"Input shape: {input_ids.shape}",
    f"Input IDs: {input_ids}",
    sep="\n",
)

Input shape: torch.Size([2, 10])
Input IDs: tensor([[ 9161, 40352, 37484, 49249, 42287, 48920, 41276, 36679, 39219,  3621],
        [24871, 44604, 27175, 18944,  9701, 29295, 12688, 50128, 20299, 47859]])


In [15]:
# Forward pass with forward()'s defaults (layer norm applied, then dropout).
embeddings = embedding_layer(input_ids=input_ids)

In [16]:
# Bare expression: display the embedding tensor as this cell's output.
embeddings

tensor([[[-0.0000, -0.9064,  1.4496,  ..., -0.4646, -0.0521,  0.1483],
         [ 0.3806,  0.1493, -0.5608,  ...,  2.6209, -2.1414,  0.9669],
         [ 0.5841, -2.0113,  0.3768,  ..., -0.5104, -1.9701,  0.1873],
         ...,
         [-0.3686,  0.6418,  0.0000,  ...,  1.4214, -0.0256, -0.3128],
         [ 0.7905, -1.2990,  0.2506,  ...,  0.0000,  0.6091, -1.0938],
         [ 0.2463,  0.6668, -1.2025,  ..., -0.4090, -1.7647, -0.9470]],

        [[ 1.0331, -1.1924,  1.3248,  ..., -0.7028,  0.6387, -1.9860],
         [ 0.8715,  1.5800, -1.0191,  ...,  0.5569,  0.1109,  2.1714],
         [-0.0000,  1.0864,  2.6517,  ...,  0.4587, -0.3082, -1.5197],
         ...,
         [-0.6633,  0.0000,  1.7616,  ...,  0.7605,  0.0335, -2.8521],
         [-1.5386, -0.4770, -0.1424,  ...,  0.0578, -0.7824,  1.3027],
         [-1.7761,  0.8889, -1.9276,  ..., -0.3346, -1.0794,  0.4307]]],
       grad_fn=<MulBackward0>)

In [17]:
# Compare the actual output shape against (batch, sequence, embedding width).
print(
    f"Output embeddings shape: {embeddings.shape}",
    f"Expected shape: ({batch_size}, {seq_len}, {config.embed_dim})",
    sep="\n",
)

Output embeddings shape: torch.Size([2, 10, 768])
Expected shape: (2, 10, 768)


In [18]:
# Same forward pass with the layer-norm switch spelled out explicitly.
embeddings_with_ln = embedding_layer(
    input_ids,
    use_layer_norm=True,  # also the default value of forward()
)
print(
    f"Output with layer norm shape: {embeddings_with_ln.shape}"
)

Output with layer norm shape: torch.Size([2, 10, 768])


In [19]:
# Run only the token-embedding sub-module to inspect its output on its own.
token_emb = embedding_layer.token_embedding(
    input_ids
)
print(
    f"Token embeddings shape: {token_emb.shape}"
)

Token embeddings shape: torch.Size([2, 10, 768])


In [20]:
# Run only the positional sub-module; it reads just the batch/sequence
# dimensions of its argument, so the token embeddings supply the shape.
pos_emb = embedding_layer.pos_embedding(
    token_emb
)
print(
    f"Positional embeddings shape: {pos_emb.shape}"
)

Positional embeddings shape: torch.Size([2, 10, 768])


In [21]:
# Repeat the forward pass with the fixed sinusoidal encodings instead of the
# learned positional table; every other setting keeps its default.
config_sin = GPTEmbeddingConfig(
    use_sinusoidal=True,
)
embedding_layer_sin = create_gpt_embedding(
    config_sin
)
embeddings_sin = embedding_layer_sin(
    input_ids
)
print(
    f"Sinusoidal embeddings shape: {embeddings_sin.shape}"
)

Sinusoidal embeddings shape: torch.Size([2, 10, 768])


In [22]:
# Count every trainable scalar registered on the embedding layer.
total_params = sum(
    weight.numel()
    for weight in embedding_layer.parameters()
)
print(
    f"\nTotal parameters in embedding layer: {total_params:,}"
)


Total parameters in embedding layer: 39,385,344


In [23]:
# Number of entries in the token lookup table.
token_params = (
    embedding_layer.token_embedding
    .embedding.weight
    .numel()
)

In [24]:
# Bare expression: display the token-embedding parameter count as this cell's output.
token_params

38597376

In [26]:
# Number of entries in the learned positional table. The `positional_embedding`
# attribute exists only on the learned variant, not on the sinusoidal one.
pos_params = (
    embedding_layer.pos_embedding
    .positional_embedding.weight
    .numel()
)
pos_params

786432

In [28]:
# Total number of scalars across the LayerNorm's parameters.
ln_params = sum(
    weight.numel()
    for weight in embedding_layer.layer_norm.parameters()
)
ln_params

1536

In [29]:
# Per-component parameter breakdown, formatted with thousands separators.
print(
    f"Token embedding parameters: {token_params:,}",
    f"Positional embedding parameters: {pos_params:,}",
    f"Layer norm parameters: {ln_params:,}",
    sep="\n",
)

Token embedding parameters: 38,597,376
Positional embedding parameters: 786,432
Layer norm parameters: 1,536


In [30]:
# Testing gradient flow.
# Use a fresh forward pass rather than the graph stored in `embeddings`:
# autograd frees a graph after its first backward(), so calling backward() on
# the stored tensor would raise a RuntimeError whenever this cell is re-run.
embedding_layer.zero_grad()
grad_check_loss = embedding_layer(input_ids).sum()
grad_check_loss.backward()

# Every trainable parameter should have received a gradient. Only existence is
# checked, not magnitude: the sum of a LayerNorm output is nearly constant in
# its input, so upstream gradients can legitimately be very small.
params_without_grad = [
    name
    for name, param in embedding_layer.named_parameters()
    if param.grad is None
]
assert not params_without_grad, f"No gradient reached: {params_without_grad}"

In [31]:
# Summary statistics over every element of the embedding tensor.
print(
    f"\nEmbedding statistics:",
    f"Mean: {embeddings.mean().item():.4f}",
    f"Std: {embeddings.std().item():.4f}",
    f"Min: {embeddings.min().item():.4f}",
    f"Max: {embeddings.max().item():.4f}",
    sep="\n",
)


Embedding statistics:
Mean: -0.0033
Std: 1.0500
Min: -4.0809
Max: 4.1776
