# Decoder Only Transformer

**B = batch size: the number of sequences (sentences) we process in parallel.**

**T = Time (sequence length / context length) Number of tokens in each sequence.  if T = 8, each input has 8 tokens (like 8 words/chars).**

**V = vocab size (number of possible tokens)**

**C = Dimensionality of the vector representing each token. Example: if C = 384, each token is mapped to a 384-dimensional embedding vector.**

**The input to a Transformer is usually shaped:**
**(B, T, C)**


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
from math import log
from torch.nn import functional as F

# Resolve the compute device once. Both branches now yield a torch.device
# (the CPU fallback used to be the bare string "cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt

In [None]:
# Read the whole corpus into memory. The encoding is pinned so the decoded
# characters do not depend on the platform's default locale.
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
char = sorted(set(text))
vocab_size = len(char)
print(vocab_size)

# Variables

In [None]:
# Fix the RNG seed so sampled batches and initial weights are reproducible.
torch.manual_seed(1337)

# --- batching ---
batch_size = 64      # sequences processed in parallel (B)
block_size = 256     # context length in tokens (T)

# --- model size ---
n_embed = 384        # embedding width (C)
n_head = 6           # attention heads per block
n_layer = 6          # number of stacked blocks
dropout = 0.25       # dropout probability

# --- optimisation / evaluation schedule ---
learning_rate = 3e-4
max_iter = 5000      # training-loop iterations
eval_interval = 500  # estimate the loss every this many iterations
eval_iter = 200      # batches averaged per loss estimate

### Creating a mapping for character

In [None]:

# Lookup tables between characters and integer ids, built from the vocabulary.
stoi = dict(zip(char, range(len(char))))  # character -> id, e.g. {"a": 1}
itos = dict(enumerate(char))              # id -> character, e.g. {1: "a"}


def encode(x):
    """Return the list of ids for the characters of string x."""
    return [stoi[c] for c in x]


def decode(x):
    """Return the string spelled by the ids in x."""
    return "".join(itos[i] for i in x)


print(encode("How are you"))
print(decode(encode("How are you")))

### Split the Data for Training and Validation

In [None]:
# Encode the full corpus as one 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% of the tokens train the model; the remaining 10% validate it.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


print(data[:100])

#### Data batch for training and validation

In [None]:
def get_batch(split):
  """Sample a random batch of (input, target) windows from one data split.

  Args:
    split: "train" selects train_data; any other value selects val_data.

  Returns:
    (x, y), each of shape (batch_size, block_size) and placed on `device`.
    y is x shifted one token to the right, so y[b, t] is the token that
    follows x[b, :t+1].
  """
  data = train_data if split == "train" else val_data
  # Random window starts; the upper bound leaves room for the one extra
  # token needed by the shifted targets.
  ix = torch.randint(len(data) - block_size, (batch_size,))
  # torch.stack turns the list of 1-D windows into a single 2-D batch.
  x = torch.stack([data[i:i + block_size] for i in ix])
  y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
  # Place the batch on the configured device; it previously always stayed
  # on the CPU regardless of `device`.
  return x.to(device), y.to(device)




# Draw one training batch and show its shape and contents.
xb, yb = get_batch("train")
print("Input:")
print(xb.shape)
print(xb)
print("**"*50)
print("Outputs:")
print(yb.shape)
print(yb)

print("-"*100)

# Show how each prefix of a sequence maps to its next-token target.
# Only the first few steps of the first sequence are printed: walking the
# whole batch would emit batch_size * block_size lines, each with a context
# of up to block_size ids.
preview_steps = min(8, block_size)
for t in range(preview_steps):
  context = xb[0, :t+1]
  target = yb[0, t]
  print(f"When input is: {context.tolist()} the target is: {target}")



# Self-Attention Head (Single Masked Attention Head)

In [None]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: Width of the query/key/value projections and of the output.
    """

    def __init__(self,head_size):
        super().__init__()
        self.query_vector = nn.Linear(n_embed, head_size, bias=False)
        self.key_vector = nn.Linear(n_embed, head_size, bias=False)
        self.value_vector = nn.Linear(n_embed, head_size, bias=False)
        # Lower-triangular causal mask. Registered as a buffer so it is stored
        # with the module without being a trainable parameter.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self,x):
        """Attend over x of shape (B, T, n_embed); returns (B, T, head_size)."""
        _, T, _ = x.shape
        q = self.query_vector(x)  # (B, T, head_size)
        k = self.key_vector(x)    # (B, T, head_size)
        v = self.value_vector(x)  # (B, T, head_size)
        # Scaled dot-product scores. The scale is 1/sqrt(key width), i.e.
        # head_size, not the embedding width of x; scaling by the embedding
        # width shrinks the scores too much when head_size < n_embed.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B,T,hs) @ (B,hs,T) -> (B,T,T)
        # Hide future positions: entries above the diagonal become -inf,
        # which softmax turns into zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # each row sums to 1
        # Weighted sum of the value vectors.
        out = wei @ v  # (B,T,T) @ (B,T,hs) -> (B,T,hs)
        return out

## Multi-Head Attention

In [None]:
   
class MultiHeadAttetnion(nn.Module):
    """Several causal self-attention heads run in parallel, then mixed.

    Args:
        num_heads (int): Number of parallel attention heads. Each head is its
            own self-attention block, so different heads can capture different
            relationships ("representation subspaces") of the input sequence.
        head_size (int): Width of each head's query, key, value and output
            vectors.

    Example:
        With n_embed = 32, num_heads = 4 and head_size = 8:

        - Each token embedding has 32 dimensions.
        - Each head applies Linear(32 -> 8) to produce q, k, v of size 8.
        - One head outputs (B, T, 8); with 4 heads there are 4 such outputs.
        - Concatenating across heads gives (B, T, 32).

    Notes:
        The concatenated width is num_heads * head_size. The final linear
        projection maps that width to n_embed, so the result can be added to
        the input in a residual connection even when num_heads * head_size
        differs from n_embed.
    """

    def __init__(self,num_heads,head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The heads are computed independently; this projection mixes their
        # concatenated outputs and returns them to the embedding width.
        # Its input width is derived from the heads rather than assumed to
        # equal n_embed.
        self.proj = nn.Linear(num_heads * head_size, n_embed)

    def forward(self,x):
        """Map x of shape (B, T, n_embed) to an output of the same shape."""
        # Concatenate the head outputs along the channel dimension.
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads*head_size)
        out = self.proj(out)  # (B, T, n_embed)
        return out

## Feed-Forward Neural Network

In [None]:
class FeedForward(nn.Module):
    """Per-token MLP: widen to 4 * n_embed, ReLU, project back, dropout."""

    def __init__(self,n_embed):
        super().__init__()
        hidden = 4 * n_embed
        layers = [
            nn.Linear(n_embed, hidden),   # expand the embedding width
            nn.ReLU(),
            nn.Linear(hidden, n_embed),   # back to n_embed so a residual add is possible
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self,x):
        """Apply the MLP to x; the last dimension stays n_embed."""
        return self.net(x)

## Decoder Block

In [None]:
class Block(nn.Module):
    """Transformer block: self-attention, then feed-forward.

    Each sub-layer normalises its input first and is wrapped in a residual
    connection (LayerNorm is applied before the sub-layer, not after it).
    """

    def __init__(self,n_embed,n_head):
        super().__init__()
        # The embedding width is split evenly across heads, e.g. 32 // 4 == 8.
        self.sa = MultiHeadAttetnion(num_heads=n_head, head_size=n_embed // n_head)
        self.ffd = FeedForward(n_embed=n_embed)
        self.ln1 = nn.LayerNorm(n_embed)  # applied before attention
        self.ln2 = nn.LayerNorm(n_embed)  # applied before feed-forward

    def forward(self,x):
        """Return x after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffd(self.ln2(attended))

## Model

In [None]:
# B = batch size
# T = sequence length (context length we fed in)
# V = vocab size (number of possible tokens)
class Bigramlanguagemodel(nn.Module):
    """Decoder-only transformer language model over token ids.

    Token and position embeddings are summed, passed through n_layer blocks
    and a final LayerNorm, then projected to vocabulary logits.
    """

    def __init__(self):
        super().__init__()
        # Token id -> embedding vector.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        # Position index (0 .. block_size-1) -> embedding of the same width.
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        # Stack of n_layer masked self-attention blocks.
        self.block = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])

        self.ln_F = nn.LayerNorm(n_embed)  # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self,idx,target=None):
        """Compute next-token logits and, optionally, the loss.

        Args:
            idx: (B, T) integer token ids, with T <= block_size.
            target: Optional (B, T) token ids to predict.

        Returns:
            (logits, loss). Without target, logits is (B, T, vocab_size) and
            loss is None. With target, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy against target.
        """
        _, T = idx.shape
        token_emb = self.token_embedding_table(idx)  # (B, T, n_embed)
        # Position ids are created on the same device as the input ids so the
        # lookup never mixes devices.
        positions = torch.arange(T, device=idx.device)
        pos_embedding = self.position_embedding_table(positions)  # (T, n_embed)
        x = token_emb + pos_embedding  # broadcast over the batch -> (B, T, n_embed)
        x = self.block(x)
        # Apply the final LayerNorm; it used to be constructed but skipped.
        x = self.ln_F(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if target is not None:
            # F.cross_entropy expects (N, classes) logits and (N,) targets,
            # so the batch and time dimensions are merged.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            target = target.view(B*T)
            loss = F.cross_entropy(logits, target)
        return logits, loss

    @torch.no_grad()
    def generate(self,idx,max_new_token):
        """Extend idx by sampling max_new_token tokens one at a time.

        Args:
            idx: (B, T) token ids used as the starting context.
            max_new_token: Number of tokens to append.

        Returns:
            (B, T + max_new_token) token ids.

        Sampling runs without gradient tracking and in eval mode, so dropout
        is off; the previous train/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_token):
                # The position table only covers block_size positions, so keep
                # at most the last block_size tokens as context.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # Keep only the prediction for the last time step.
                logits = logits[:, -1, :]  # (B, vocab_size)
                probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # Append the sampled token to the running sequence.
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# Build the model and place its parameters on the selected device before the
# optimizer is created, so the optimizer holds the parameters being trained.
model = Bigramlanguagemodel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

## Loss Calculation

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, eval_iter random batches are evaluated and their losses
    averaged. The model is put in eval mode for the measurement and switched
    back to train mode before returning.

    Returns:
        dict with keys "train" and "val", each a 0-d tensor holding the mean.
    """
    model.eval()
    averages = {}
    for split in ("train", "val"):
        batch_losses = []
        for _ in range(eval_iter):
            inputs, targets = get_batch(split)
            _, loss = model(inputs, targets)
            batch_losses.append(loss.item())
        averages[split] = torch.tensor(batch_losses).mean()
    model.train()
    return averages

# Model Training

In [None]:
# `step` is used instead of `iter` so the builtin iter() is not shadowed.
for step in range(max_iter):
    # Every eval_interval steps, report the estimated train/val loss.
    # Single quotes inside the f-string keep it valid before Python 3.12.
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}:train loss {losses['train']:.4f} val loss {losses['val']:.4f}")

    # One optimisation step on every iteration. This used to be nested under
    # the `if` above, so the model was only updated once per eval_interval.
    xb, yb = get_batch("train")

    optimizer.zero_grad()         # clear gradients left from the previous step
    logits, loss = model(xb, yb)  # forward pass
    loss.backward()               # backpropagation
    optimizer.step()              # update parameters

## Generate from the model

In [None]:
# Sample 2000 new tokens, starting from a single token with id 0.
context = torch.zeros(1, 1, dtype=torch.long, device=device)  # shape (1, 1)
output = model.generate(idx=context, max_new_token=2000)
# Take the first (only) sequence, convert tensor -> list[int] -> text.
decoded = decode(output[0].tolist())
print(decoded)

# Mathematic trick in Self Attention

# Small seeded random batch for the averaging experiments.
torch.manual_seed(1337)
B, T, C = (4, 8, 2)  # batch, time, channels
x = torch.randn((B, T, C))
x.shape

In [None]:
# Version 2: causal running mean written as a matrix product.
# After normalisation, row t holds equal weights over positions 0..t.
wei = torch.ones(T, T).tril()
wei = wei.div(wei.sum(dim=1, keepdim=True))
# (T,T) is broadcast over the batch: (B,T,T) @ (B,T,C) -> (B,T,C)
xbow2 = torch.matmul(wei, x)
wei.shape, x.shape

In [None]:
# Version 3: the same causal average, obtained through softmax.

tril = torch.ones(T, T).tril()  # lower-triangular matrix: 1 on/below the diagonal, 0 above
wei = torch.zeros(T, T)         # all scores start equal (T x T zeros)

# Where tril == 0 (future positions) the score becomes -inf; elsewhere it stays 0.
wei = wei.masked_fill(tril == 0, float("-inf"))

# Softmax over the last dimension: exp(-inf) = 0, so masked positions get no
# weight and each row sums to 1 over the positions it may see.
wei = wei.softmax(dim=-1)

xbow3 = torch.matmul(wei, x)  # (T,T) broadcast over batch, @ (B,T,C) -> (B,T,C)


# Self Attention

# 📝 Encoder Block – Cheatsheet

### Self Attention

* In **self attention** all 3 vectors (Query, Key, Value) come from the **same sequence**.
* In **multi-head attention** we have multiple self-attention layers (heads) inside the block to capture **different meanings** of the input sequence.
* In **cross attention** the Query comes from one sequence (decoder output) while Key and Value come from another sequence (encoder output).

---

### How Q, K, V are made

* Query, Key, and Value vectors are generated by applying **linear transformation** on the embedding vector.
* Usually done with a small neural network (linear layers) that learns the projection matrices.
* Then dot product is applied to form attention scores.

---

### Flow of Encoder Block

1. **Tokenize input**

   * Convert words → tokens → numbers.

2. **Embedding vector**

   * Convert token IDs → dense embedding vectors.

3. **Positional encoding**

   * Create positional encoding (same dimension as embedding).
   * Add embedding + positional encoding → gives input with position info.

4. **Self Attention layer**

   * Generate Q, K, V vectors using linear transformation.
   * Compute dot product: $w = Q K^{\top}$.
   * Scale: divide by $\sqrt{d_k}$, where $d_k$ is the dimension of the key vectors.
   * Apply softmax → get normalized weights in range 0–1.
   * Multiply with Value vector: ( y = w \times V ).

5. **Feed Forward Neural Network (FFN)**

   * Pass the result into a neural network.
   * Apply **ReLU activation** to add non-linearity.
   * Output layer of the FFN has same dimension as embedding vector.

6. **Final Output**

   * Encoder output = **contextual embedding** of each token.


# 📝 Decoder Block – Cheatsheet

### Cross vs Self Attention

* **Decoder has 2 attention layers**:

  1. **Masked Self Attention** → lets the decoder look at past tokens only, never future tokens; done by `wei.masked_fill(tril==0,float("-inf"))`.
  2. **Cross Attention** → Query comes from decoder, Key + Value come from encoder output.

---

### Flow of Decoder Block

1. **Tokenize target sequence**

   * Convert target sentence (shifted right during training to add <SOS>) → tokens → numbers.

2. **Embedding vector**

   * Convert token IDs → dense embedding vectors.

3. **Positional encoding**

   * Add positional encoding to embeddings(same dimension as input embedding).

4. **Masked Self Attention**

   * Generate Q, K, V from decoder input (like encoder).
   * Apply mask (future tokens hidden) ==> wei.masked_fill(tril==0,float("-inf")).
   * Compute attention: $\text{out} = \mathrm{softmax}\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V$, where $d_k$ is the dimension of the key vectors.
   * This ensures each position only attends to **previous tokens**.

5. **Cross Attention**

   * Query (Q) from decoder hidden states.
   * Key (K), Value (V) from encoder output.
   * Attention = how decoder tokens attend to encoder tokens.
   * Output is contextual info combining encoder + decoder states.

6. **Feed Forward Neural Network (FFN)**

   * Pass attention output into FFN with ReLU for non-linearity.
   * Output dimension = same as embedding size.

7. **Final Softmax Layer (Prediction)**

   * After stacking decoder blocks, final layer projects output → vocabulary size.
   * Apply softmax → gives probability distribution of next token.

---


In [None]:
# Version 4: a single head of masked self-attention on random data.
# For an encoder-style (unmasked) head, drop the masked_fill step.
torch.manual_seed(1337)
B, T, C = (4, 8, 32)
x = torch.randn((B, T, C))  # dummy input batch

head_size = 16  # width of each query/key/value vector
# The projections are created in query, key, value order so the seeded
# weight initialisation stays the same.
query_vector = nn.Linear(C, head_size, bias=False)
key_vector = nn.Linear(C, head_size, bias=False)
value_vector = nn.Linear(C, head_size, bias=False)

q, k, v = query_vector(x), key_vector(x), value_vector(x)  # each (B, T, 16)

# Raw attention scores: (B,T,16) @ (B,16,T) -> (B,T,T)
wei = torch.matmul(q, k.transpose(-1, -2))

# Causal mask: positions above the diagonal (tril == 0) become -inf, so the
# softmax gives them zero weight and each row sums to 1.
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float("-inf")).softmax(dim=-1)

# Weighted sum of the value vectors -> (B, T, head_size)
out = torch.matmul(wei, v)
out.shape


# Why we scale the attention scores before softmax

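1. For larger vector dimensions, the dot product `q @ k` sums more terms, so the scores are larger in magnitude and their variance is high (about `head_size` when q and k have unit variance).
2. For smaller vector dimensions, the dot product `q @ k` sums fewer terms, so the scores and their variance are lower.
3. Softmax applied to high-variance scores saturates: the largest score takes almost all of the probability and the rest get almost none. Multiplying the scores by `1/sqrt(head_size)` keeps their variance near 1 and the softmax output well spread.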

In [None]:
# Random queries and keys from a standard normal; raw dot products, no scaling.
q = torch.randn((B, T, head_size))
k = torch.randn((B, T, head_size))
wei = torch.matmul(q, k.transpose(-1, -2))

In [None]:
# Without scaling: compare the variance of k, q and the scores. The variance of
# the scores is much higher than that of q and k, i.e. it is not preserved.
tuple(t.var() for t in (k, q, wei))

In [None]:
# Same experiment, with the scores multiplied by 1/sqrt(head_size).
q = torch.randn((B, T, head_size))
k = torch.randn((B, T, head_size))
wei = torch.matmul(q, k.transpose(-1, -2)) * head_size ** -0.5

In [None]:
# With scaling: the variance of the scores stays on the same order as that of
# q and k, i.e. it is preserved.
tuple(t.var() for t in (k, q, wei))