In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Set random seeds for reproducibility.
# A single constant keeps the torch and numpy seeds in sync and gives readers
# one place to change when they want a different random draw.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Plotting setup
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Fall back to the legacy 'seaborn' name on older releases instead of raising.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("Ready to implement attention mechanisms!")


In [None]:
class BasicAttention(nn.Module):
    """Additive (concatenation-based) attention over a sequence of keys.

    Every key is concatenated with the query, passed through a linear layer
    followed by ``tanh``, and projected to a single scalar score. A softmax
    over the sequence axis turns the scores into weights, and the context is
    the weighted sum of the values.
    """

    def __init__(self, hidden_dim):
        """
        Args:
            hidden_dim: feature size of the query and of each key.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Linear layers for computing attention scores.
        # `attention` mixes the concatenated [query; key] pair down to
        # hidden_dim features, `v` reduces those features to one score.
        # (Creation order is kept so seeded initialisation is unchanged.)
        self.attention = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, query, keys, values, mask=None):
        """
        Args:
            query: [batch_size, hidden_dim] - current decoder state
            keys: [batch_size, seq_len, hidden_dim] - encoder outputs
            values: [batch_size, seq_len, value_dim] - vectors to average
                (the encoder outputs again in the basic setup)
            mask: optional [batch_size, seq_len] tensor; truthy entries mark
                positions that may be attended to, falsy entries (e.g.
                padding) receive a weight of exactly 0. ``None`` (default)
                attends to every position. Each row must keep at least one
                truthy entry, otherwise the softmax of that row is NaN.

        Returns:
            context: [batch_size, value_dim] - attended context vector
            attention_weights: [batch_size, seq_len] - attention weights
        """
        batch_size, seq_len, hidden_dim = keys.size()

        # Broadcast the query along the sequence axis (a view, no copy):
        # [batch_size, hidden_dim] -> [batch_size, seq_len, hidden_dim]
        query_expanded = query.unsqueeze(1).expand(batch_size, seq_len, hidden_dim)

        # combined: [batch_size, seq_len, hidden_dim * 2]
        combined = torch.cat([query_expanded, keys], dim=2)

        # energy: [batch_size, seq_len, hidden_dim]
        energy = torch.tanh(self.attention(combined))

        # [batch_size, seq_len, 1] -> [batch_size, seq_len]
        attention_scores = self.v(energy).squeeze(2)

        if mask is not None:
            # A score of -inf becomes a weight of exactly 0 after the softmax,
            # so the remaining positions are renormalised among themselves.
            keep = mask.to(device=attention_scores.device, dtype=torch.bool)
            attention_scores = attention_scores.masked_fill(~keep, float('-inf'))

        # Normalise over the sequence axis so each row sums to 1.
        attention_weights = F.softmax(attention_scores, dim=1)

        # Weighted sum of the values:
        # [batch_size, seq_len, 1] * [batch_size, seq_len, value_dim]
        # summed over seq_len -> [batch_size, value_dim]
        context = torch.sum(attention_weights.unsqueeze(2) * values, dim=1)

        return context, attention_weights

# Test the basic attention mechanism
def test_basic_attention():
    """Smoke-test BasicAttention on a small random batch.

    Draws a random query and random keys, reuses a copy of the keys as the
    values, runs one forward pass and prints the tensor shapes together with
    each batch row's attention weights and their sum.

    Returns:
        The tuple (context, attention_weights, attention_layer) so the
        caller can inspect the results and reuse the layer.
    """
    batch_size, seq_len, hidden_dim = 2, 5, 8

    # Random inputs; in this basic setup the values are a copy of the keys.
    query = torch.randn(batch_size, hidden_dim)
    keys = torch.randn(batch_size, seq_len, hidden_dim)
    values = keys.clone()

    # The layer is built after the inputs so the random stream is consumed
    # in the same order on every run.
    attention_layer = BasicAttention(hidden_dim)
    context, attention_weights = attention_layer(query, keys, values)

    # Header and input shapes, followed by a blank line.
    print("BASIC ATTENTION TEST", "=" * 30, sep="\n")
    print(
        f"Query shape: {query.shape}",
        f"Keys shape: {keys.shape}",
        f"Values shape: {values.shape}",
        "",
        sep="\n",
    )

    # Output shapes, a blank line, then the per-row weights.
    print(
        f"Context shape: {context.shape}",
        f"Attention weights shape: {attention_weights.shape}",
        "",
        "Attention weights (should sum to 1):",
        sep="\n",
    )
    for i, weights in enumerate(attention_weights.detach().numpy()):
        print(f"  Batch {i}: {weights}")
        print(f"  Sum: {weights.sum():.6f}")

    return context, attention_weights, attention_layer

# Run the test at cell level and keep what it returns as notebook globals:
# the context vectors, the attention weights, and the layer that produced them.
# Note: `attention_layer` is rebound by the later attention test cell.
context, weights, attention_layer = test_basic_attention()


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# This cell repeats the seeding/device/plot setup from the first cell.
# Re-seeding here resets the random state, so the cells below draw the same
# numbers regardless of what ran before them.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Fall back to the legacy 'seaborn' name on older releases instead of raising.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")


In [None]:
class BasicAttention(nn.Module):
    """Additive (concatenation-based) attention of a decoder state over encoder states.

    The decoder state is paired with every encoder state, each pair is scored
    by a small feed-forward network (linear -> tanh -> linear to a scalar),
    the scores are softmax-normalised over the sequence axis, and the context
    vector is the resulting weighted sum of the encoder states.
    """

    def __init__(self, hidden_dim):
        """
        Args:
            hidden_dim: feature size of the decoder state and of each encoder state.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Linear layers for computing attention scores.
        # (Creation order is kept so seeded initialisation is unchanged.)
        self.attention = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """
        Args:
            decoder_hidden: (batch_size, hidden_dim) - current decoder state
            encoder_outputs: (batch_size, seq_len, hidden_dim) - all encoder states
            mask: optional (batch_size, seq_len) tensor; truthy entries mark
                encoder positions that may be attended to, falsy entries
                (e.g. padding) receive a weight of exactly 0. ``None``
                (default) attends to every position. Each row must keep at
                least one truthy entry, otherwise its softmax is NaN.

        Returns:
            context_vector: (batch_size, hidden_dim) - weighted sum of encoder outputs
            attention_weights: (batch_size, seq_len) - attention weights
        """
        seq_len = encoder_outputs.size(1)

        # Broadcast the decoder state to every encoder position.
        # `expand` returns a view, so the state is not copied here; the
        # concatenation below is the only place the data is materialised.
        # (batch_size, seq_len, hidden_dim)
        decoder_hidden_expanded = decoder_hidden.unsqueeze(1).expand(-1, seq_len, -1)

        # Concatenate decoder and encoder states
        # (batch_size, seq_len, hidden_dim * 2)
        combined = torch.cat([decoder_hidden_expanded, encoder_outputs], dim=2)

        # (batch_size, seq_len, hidden_dim)
        energy = torch.tanh(self.attention(combined))

        # (batch_size, seq_len, 1) -> (batch_size, seq_len)
        attention_scores = self.v(energy).squeeze(2)

        if mask is not None:
            # A score of -inf becomes a weight of exactly 0 after the softmax,
            # so the remaining positions are renormalised among themselves.
            keep = mask.to(device=attention_scores.device, dtype=torch.bool)
            attention_scores = attention_scores.masked_fill(~keep, float('-inf'))

        # Convert scores to probabilities (attention weights)
        attention_weights = F.softmax(attention_scores, dim=1)

        # Weighted sum of encoder outputs via a batched matrix product:
        # (batch_size, 1, seq_len) x (batch_size, seq_len, hidden_dim) -> (batch_size, 1, hidden_dim)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        context_vector = context_vector.squeeze(1)  # (batch_size, hidden_dim)

        return context_vector, attention_weights

# Test the attention mechanism
hidden_dim = 128
seq_len = 6
batch_size = 2

# Create test data.
# The layer is built before the inputs are drawn; keeping this order means the
# seeded random stream yields the same weights and inputs on every run.
attention_layer = BasicAttention(hidden_dim)
decoder_hidden = torch.randn(batch_size, hidden_dim)
encoder_outputs = torch.randn(batch_size, seq_len, hidden_dim)

# Forward pass
context_vector, attention_weights = attention_layer(decoder_hidden, encoder_outputs)

# Check the invariants instead of relying on someone reading the printout:
# output shapes match the documented contract and each row of weights is a
# probability distribution.
assert context_vector.shape == (batch_size, hidden_dim)
assert attention_weights.shape == (batch_size, seq_len)
assert (attention_weights.sum(dim=1) - 1).abs().max().item() < 1e-5, \
    "attention weights must sum to 1 for every batch row"

print("Attention Mechanism Test:")
print(f"Decoder hidden shape: {decoder_hidden.shape}")
print(f"Encoder outputs shape: {encoder_outputs.shape}")
print(f"Context vector shape: {context_vector.shape}")
print(f"Attention weights shape: {attention_weights.shape}")
print(f"Attention weights sum: {attention_weights.sum(dim=1)}")  # Should be 1.0
# .cpu() is a no-op for CPU tensors and keeps this line working if the tensors
# are ever moved to an accelerator.
print(f"Sample attention weights: {attention_weights[0].detach().cpu().numpy()}")
