In [19]:
import torch
import torch.nn as nn
import math

# =============================================
# LayerNormalization (from earlier)
# =============================================
class LayerNormalization(nn.Module):
    """Normalise the last dimension, then apply a learnable scale and shift.

    Every vector along the final axis is centred on its own mean and divided
    by ``std + eps``. ``Tensor.std`` is called with its default settings
    (the Bessel-corrected sample standard deviation), and ``eps`` is added to
    the standard deviation itself rather than to the variance.

    Args:
        features: Size of the last dimension; length of ``alpha`` and ``bias``.
        eps: Small constant added to the standard deviation before dividing.
    """

    def __init__(self, features: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Learnable per-feature scale, initialised to 1 (identity scaling).
        self.alpha = nn.Parameter(torch.ones(features))
        # Learnable per-feature shift, initialised to 0 (no shift).
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim."""
        # keepdim=True keeps a size-1 trailing axis so the statistics
        # broadcast against x.
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

# =============================================
# ResidualConnection
# =============================================
class ResidualConnection(nn.Module):
    """
    Implements: x + Dropout(Sublayer(LayerNorm(x))) , original in the paper -LayerNormalization(x+Sublayer(x))
    
    This is the "Pre-LN" variant where LayerNorm comes BEFORE the sublayer.
    Original paper used "Post-LN": LayerNorm(x + Sublayer(x))
    """
    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)
    
    def forward(self, x, sublayer):
        """
        Args:
            x: Input tensor (batch, seq_len, features)
            sublayer: A function/module (e.g., attention or feedforward)
        
        Returns:
            x + dropout(sublayer(norm(x)))
        """
        return x + self.dropout(sublayer(self.norm(x)))

# =============================================
# Dummy Sublayer (FeedForward example)
# =============================================
class SimpleFeedForward(nn.Module):
    """Two-layer feed-forward demo sublayer: Linear -> ReLU -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
    """
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Expand to d_ff, apply ReLU, then project back to d_model."""
        hidden = self.linear1(x)
        activated = torch.relu(hidden)
        return self.linear2(activated)


In [20]:
# =============================================
# DEMO: Step-by-step execution
# =============================================
print("=== RESIDUAL CONNECTION DEMO ===\n")

# Seed PyTorch's global RNG up front. Randomly initialised layers, dropout
# masks and torch.randn all draw from it, so seeding here makes a
# Restart & Run All of this notebook print the same numbers every time.
SEED = 42
torch.manual_seed(SEED)

# Setup
batch_size = 2  # number of sentences
seq_len = 3     # number of tokens in each sentence
d_model = 4     # feature dimension
d_ff = 8        # hidden layer dimension
dropout = 0.1   # dropout probability

# Input: (batch, seq, features)
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0],
                   [5.0, 6.0, 7.0, 8.0],
                   [9.0, 10.0, 11.0, 12.0]],

                  [[13.0, 14.0, 15.0, 16.0],
                   [17.0, 18.0, 19.0, 20.0],
                   [21.0, 22.0, 23.0, 24.0]]])

# Fail fast if the literal above is edited without updating the sizes
# (or the other way round).
assert x.shape == (batch_size, seq_len, d_model), "x does not match the configured sizes"

print("Step 1 - Input x:")
print("Shape:", x.shape)  # (2, 3, 4)
print(x[0])  # Show first batch
print()

=== RESIDUAL CONNECTION DEMO ===

Step 1 - Input x:
Shape: torch.Size([2, 3, 4])
tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])



In [21]:
# Instantiate the feed-forward demo module that plays the role of the sublayer
sublayer = SimpleFeedForward(d_model=d_model, d_ff=d_ff)
sublayer

SimpleFeedForward(
  (linear1): Linear(in_features=4, out_features=8, bias=True)
  (linear2): Linear(in_features=8, out_features=4, bias=True)
)

In [9]:
# Build the residual wrapper; it owns the LayerNorm and the Dropout module
residual = ResidualConnection(d_model, dropout=dropout)
residual

ResidualConnection(
  (dropout): Dropout(p=0.1, inplace=False)
  (norm): LayerNormalization()
)

In [22]:
# ============================================
# MANUAL STEP-BY-STEP: rebuild x + Dropout(Sublayer(LayerNorm(x)))
# one stage at a time, starting with the LayerNorm
# ============================================
print("Step 2 - Apply LayerNorm(x):")
normalized = residual.norm(x)
first_sentence = normalized[0]  # rows of the first batch entry
print("Shape:", normalized.size())  # (2, 3, 4)
print('non-normalized and normalized input :',x,normalized)
print("normalized[0]:", first_sentence)
# Statistics are taken over the feature axis, i.e. one value per token.
print("Check: mean≈0, std≈1?")
print("  Mean:", first_sentence.mean(dim=-1))
print("  Std:", first_sentence.std(dim=-1))
print()

Step 2 - Apply LayerNorm(x):
Shape: torch.Size([2, 3, 4])
non-normalized and normalized input : tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]],

        [[13., 14., 15., 16.],
         [17., 18., 19., 20.],
         [21., 22., 23., 24.]]]) tensor([[[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]],

        [[-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619],
         [-1.1619, -0.3873,  0.3873,  1.1619]]], grad_fn=<AddBackward0>)
normalized[0]: tensor([[-1.1619, -0.3873,  0.3873,  1.1619],
        [-1.1619, -0.3873,  0.3873,  1.1619],
        [-1.1619, -0.3873,  0.3873,  1.1619]], grad_fn=<SelectBackward0>)
Check: mean≈0, std≈1?
  Mean: tensor([2.9802e-08, 2.9802e-08, 2.9802e-08], grad_fn=<MeanBackward1>)
  Std: tensor([1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>)



In [26]:
print("Step 3 - Apply Sublayer (FeedForward):")
# Feed the normalised activations (not the raw x) through the sublayer.
sublayer_out = sublayer(normalized)
print("Shape:", sublayer_out.size())  # (2, 3, 4)
print("sublayer_out:",sublayer_out)
print()

Step 3 - Apply Sublayer (FeedForward):
Shape: torch.Size([2, 3, 4])
sublayer_out: tensor([[[ 0.0744, -0.3077, -0.0528, -0.0717],
         [ 0.0744, -0.3077, -0.0528, -0.0717],
         [ 0.0744, -0.3077, -0.0528, -0.0717]],

        [[ 0.0744, -0.3077, -0.0528, -0.0717],
         [ 0.0744, -0.3077, -0.0528, -0.0717],
         [ 0.0744, -0.3077, -0.0528, -0.0717]]], grad_fn=<ViewBackward0>)



In [27]:
print("Step 4 - Apply Dropout:")
residual.dropout.train()  # Dropout only alters values in training mode
dropped = residual.dropout(sublayer_out)
print("Shape:", dropped.shape)  # (2, 3, 4)
print("dropped:", dropped)
# In training mode each element is zeroed independently with probability p,
# and the surviving elements are multiplied by 1/(1-p). So roughly p of the
# entries become 0 and the rest grow slightly; they are not all
# "zeroed/scaled" at a 10% rate.
print(f"Notice: each element is zeroed with probability p={residual.dropout.p}; "
      "survivors are scaled by 1/(1-p)")
print()

Step 4 - Apply Dropout:
Shape: torch.Size([2, 3, 4])
dropped: tensor([[[ 0.0827, -0.3419, -0.0587, -0.0797],
         [ 0.0827, -0.3419, -0.0587, -0.0797],
         [ 0.0827, -0.3419, -0.0587, -0.0797]],

        [[ 0.0000, -0.3419, -0.0587, -0.0797],
         [ 0.0827, -0.3419, -0.0587, -0.0797],
         [ 0.0827, -0.3419, -0.0587, -0.0797]]], grad_fn=<MulBackward0>)
Notice: ~10% elements zeroed/scaled



In [29]:
print("Step 5 - ADD residual (x + dropped):")
# Skip connection: the original (un-normalised) x is added to the branch output.
output = torch.add(x, dropped)
print("Shape:", output.size())  # (2, 3, 4)
print("output:", output)
print()

Step 5 - ADD residual (x + dropped):
Shape: torch.Size([2, 3, 4])
output: tensor([[[ 1.0827,  1.6581,  2.9413,  3.9203],
         [ 5.0827,  5.6581,  6.9413,  7.9203],
         [ 9.0827,  9.6581, 10.9413, 11.9203]],

        [[13.0000, 13.6581, 14.9413, 15.9203],
         [17.0827, 17.6581, 18.9413, 19.9203],
         [21.0827, 21.6581, 22.9413, 23.9203]]], grad_fn=<AddBackward0>)



In [30]:
# Look at one position only: (sentence 0, token 0).
first_token = (0, 0)
print("Compare magnitudes:")
print("  Input x[0,0]:", x[first_token])
print("  Sublayer only:", sublayer_out[first_token])
print("  With residual:", output[first_token])
print("  → Residual keeps original signal!")
print()

Compare magnitudes:
  Input x[0,0]: tensor([1., 2., 3., 4.])
  Sublayer only: tensor([ 0.0744, -0.3077, -0.0528, -0.0717], grad_fn=<SelectBackward0>)
  With residual: tensor([1.0827, 1.6581, 2.9413, 3.9203], grad_fn=<SelectBackward0>)
  → Residual keeps original signal!



In [31]:
# ============================================
# END-TO-END: one forward() call performs steps 2-5 at once
# ============================================
print("Step 6 - END-TO-END: residual(x, sublayer)")
# NOTE: dropout draws a fresh mask on every call in training mode, so the
# zeroed positions need not match the manual walk-through above.
direct_output = residual(x, sublayer=sublayer)
print("Shape:", direct_output.size())
print("direct_output:", direct_output)
print()

Step 6 - END-TO-END: residual(x, sublayer)
Shape: torch.Size([2, 3, 4])
direct_output: tensor([[[ 1.0827,  1.6581,  2.9413,  3.9203],
         [ 5.0827,  5.6581,  6.9413,  7.9203],
         [ 9.0000,  9.6581, 11.0000, 11.9203]],

        [[13.0827, 13.6581, 14.9413, 15.9203],
         [17.0000, 18.0000, 18.9413, 19.9203],
         [21.0827, 21.6581, 22.9413, 23.9203]]], grad_fn=<AddBackward0>)



In [17]:

# ============================================
# Gradient flow demonstration
# ============================================
print("Step 7 - GRADIENT FLOW (why residuals matter)")
# Take the input shape from the setup cell instead of repeating 2/3/4 here.
# The layers below are built from d_model/d_ff, so a hard-coded feature size
# would raise a shape error as soon as d_model is changed.
x_grad = torch.randn(batch_size, seq_len, d_model, requires_grad=True)
sublayer_grad = SimpleFeedForward(d_model, d_ff)
residual_grad = ResidualConnection(d_model, dropout=0.0)  # No dropout for clarity

# Forward
out = residual_grad(x_grad, sublayer_grad)
loss = out.sum()  # scalar, so backward() needs no explicit gradient argument

# Backward
loss.backward()

# x_grad.grad holds d(loss)/d(x_grad): the identity path of the skip
# connection plus whatever flows back through the sublayer branch.
print("Gradient of input:", x_grad.grad[0, 0])
print("Notice: gradient is non-zero even if sublayer gradient vanishes!")
print("This is because: ∂(x + f(x))/∂x = 1 + ∂f/∂x")
print()




Step 7 - GRADIENT FLOW (why residuals matter)
Gradient of input: tensor([0.8409, 0.8771, 0.7823, 1.4997])
Notice: gradient is non-zero even if sublayer gradient vanishes!
This is because: ∂(x + f(x))/∂x = 1 + ∂f/∂x



In [32]:
# ============================================
# Compare the value range with and without the skip connection
# ============================================
def _print_summary(title, tensor):
    """Print a heading followed by the tensor's shape and min/max values."""
    print(title)
    print("  Shape:", tensor.shape)
    print("  Range:", tensor.min().item(), "to", tensor.max().item())
    print()


print("Step 8 - RESIDUAL vs NO RESIDUAL comparison")
print("="*60)

# Without residual: only the sublayer branch on the normalised input
no_residual_out = sublayer(residual.norm(x))
_print_summary("WITHOUT residual (just sublayer output):", no_residual_out)

# With residual: the same branch, passed through dropout and added back onto x
branch = sublayer(residual.norm(x))
with_residual_out = x + residual.dropout(branch)
_print_summary("WITH residual (x + sublayer):", with_residual_out)

print("✅ Residual connection preserves input information while adding new features!")

Step 8 - RESIDUAL vs NO RESIDUAL comparison
WITHOUT residual (just sublayer output):
  Shape: torch.Size([2, 3, 4])
  Range: -0.3077440857887268 to 0.07440011203289032

WITH residual (x + sublayer):
  Shape: torch.Size([2, 3, 4])
  Range: 1.0 to 23.92030906677246

✅ Residual connection preserves input information while adding new features!
