# User encoder

## User Architecture

In [None]:
class MultiHeadAdditiveAttention(nn.Module):
    """
    Multi-head self-attention over a sequence of news representations.

    The embedding is split into ``num_heads`` chunks of size ``head_dim``.
    For head ``k`` the score between news ``i`` and news ``j`` is the bilinear
    form ``r_i^T Q_k r_j`` (despite the class name, no additive/tanh scoring is
    used here); scores are softmax-normalised over ``j``, and the attended
    context ``sum_j alpha_ij r_j`` is projected with ``V_k``. The per-head
    outputs are concatenated back to ``embed_dim`` features.
    """
    def __init__(self, embed_dim, num_heads):
        """
        Args:
        - embed_dim: Dimensionality of the input embeddings (news representations).
        - num_heads: Number of attention heads; must divide embed_dim.
        """
        super(MultiHeadAdditiveAttention, self).__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # One (head_dim, head_dim) query matrix and one value matrix per head.
        # NOTE(review): both are drawn from an unscaled standard normal; consider a
        # scaled initialisation if the softmax turns out to saturate during training.
        self.Q_n = nn.ParameterList([nn.Parameter(torch.randn(self.head_dim, self.head_dim)) for _ in range(num_heads)])
        self.V_n = nn.ParameterList([nn.Parameter(torch.randn(self.head_dim, self.head_dim)) for _ in range(num_heads)])

    def forward(self, news_embeddings):
        """
        Args:
        - news_embeddings: Tensor of shape (batch_size, num_news, embed_dim),
                           representing the news representations.

        Returns:
        - enhanced_news_embeddings: Tensor of shape (batch_size, num_news, embed_dim),
                                    enhanced news representations.
        - attention_weights: List of tensors of shape (batch_size, num_news, num_news) per head,
                             attention weights for each head.
        """
        batch_size, num_news, _ = news_embeddings.size()

        # Split the feature axis into heads: (batch_size, num_heads, num_news, head_dim).
        news_per_head = news_embeddings.view(batch_size, num_news, self.num_heads, self.head_dim).transpose(1, 2)

        all_head_outputs = []
        all_attention_weights = []

        for head_news, Q_n, V_n in zip(news_per_head.unbind(dim=1), self.Q_n, self.V_n):
            # scores[b, n, m] = r_n^T Q r_m for this head.
            scores = torch.einsum('bnd,dk,bmk->bnm', head_news, Q_n, head_news)
            attention_weights = F.softmax(scores, dim=-1)

            # Attend over the news axis (m), then project the context with V_n by
            # contracting d. The output index must be k: writing it as d would sum
            # V_n over k and merely rescale each feature by a row sum of V_n.
            head_output = torch.einsum('bnm,bmd,dk->bnk', attention_weights, head_news, V_n)

            all_head_outputs.append(head_output)
            all_attention_weights.append(attention_weights)

        # Concatenate heads back to (batch_size, num_news, embed_dim).
        concat_output = torch.cat(all_head_outputs, dim=-1)
        return concat_output, all_attention_weights


Test

In [None]:
# Smoke test: run MultiHeadAdditiveAttention once on random input and print shapes.
batch_size, num_news = 4, 10
embed_dim, num_heads = 300, 10

# Random tensor standing in for the news representations.
news_embeddings = torch.randn(batch_size, num_news, embed_dim)

# Build the layer, then run a single forward pass.
attention_layer = MultiHeadAdditiveAttention(num_heads=num_heads, embed_dim=embed_dim)
enhanced_news_embeddings, attention_weights = attention_layer(news_embeddings)

# Expected shapes:
#   enhanced embeddings -> (batch_size, num_news, embed_dim)
#   weights of one head -> (batch_size, num_news, num_news)
print("Enhanced News Embeddings Shape:", enhanced_news_embeddings.shape)
print("Attention Weights Shape (per head):", attention_weights[0].shape)


In [None]:
class UserAdditiveAttention(nn.Module):
    """
    Additive attention pooling that collapses a sequence of news
    representations into a single user vector.

    Each news vector is scored as ``q_n . tanh(V_n r + v_n)``; the scores are
    softmax-normalised across the news axis and used to average the inputs.
    """
    def __init__(self, embed_dim):
        """
        Args:
        - embed_dim: Dimensionality of the input embeddings (news representations).
        """
        super().__init__()
        # Bias-free projection; the additive bias is kept as the separate v_n parameter.
        self.V_n = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_n = nn.Parameter(torch.zeros(embed_dim))
        # Query vector that reduces each hidden state to a scalar score.
        self.q_n = nn.Parameter(torch.randn(embed_dim))

    def forward(self, news_embeddings):
        """
        Args:
        - news_embeddings: Tensor of shape (batch_size, num_news, embed_dim),
                           representing the news representations.

        Returns:
        - user_representation: Tensor of shape (batch_size, embed_dim),
                               the attention-weighted sum of the inputs.
        - attention_weights: Tensor of shape (batch_size, num_news),
                             one weight per news article, summing to 1 per user.
        """
        hidden = torch.tanh(self.V_n(news_embeddings) + self.v_n)

        # One scalar logit per news article, normalised over the news axis.
        logits = torch.einsum('bnd,d->bn', hidden, self.q_n)
        weights = F.softmax(logits, dim=-1)

        # Pool the *untransformed* inputs with the attention weights.
        pooled = torch.einsum('bn,bnd->bd', weights, news_embeddings)
        return pooled, weights

Test

In [None]:
# Smoke test: run UserAdditiveAttention once on random input and print shapes.
batch_size, num_news = 16, 10  # num_news = articles browsed by each user
embed_dim = 300

# Uniform-random tensor standing in for the news embeddings.
news_embeddings = torch.rand(batch_size, num_news, embed_dim)

# Build the pooling layer and run a single forward pass.
user_attention_layer = UserAdditiveAttention(embed_dim=embed_dim)
user_representation, attention_weights = user_attention_layer(news_embeddings)

# Expected shapes: (16, 300) for the user vector, (16, 10) for the weights.
print("User representation shape:", user_representation.shape)
print("Attention weights shape:", attention_weights.shape)


In [None]:
class UserEncoder(nn.Module):
    """
    User encoder built from two stages: MultiHeadAdditiveAttention first
    contextualises the browsed news, then UserAdditiveAttention pools the
    result into one vector per user.
    """
    def __init__(self, embed_dim, num_heads):
        """
        Args:
        - embed_dim: Dimensionality of the input embeddings (news representations).
        - num_heads: Number of attention heads in the MultiHeadAdditiveAttention layer.
        """
        super().__init__()
        self.multi_head_attention = MultiHeadAdditiveAttention(num_heads=num_heads, embed_dim=embed_dim)
        self.user_attention = UserAdditiveAttention(embed_dim=embed_dim)

    def forward(self, news_embeddings):
        """
        Args:
        - news_embeddings: Tensor of shape (batch_size, num_news, embed_dim),
                           representing the news representations.

        Returns:
        - user_representation: Tensor of shape (batch_size, embed_dim),
                               the final user representation.
        - attention_weights: Dictionary with two entries:
            - 'multi_head_attention': List with one tensor per head, each of shape
                                      (batch_size, num_news, num_news).
            - 'user_attention': Tensor of shape (batch_size, num_news) holding the
                                pooling weights over the news articles.
        """
        # Stage 1: news-to-news attention.
        contextual_news, per_head_weights = self.multi_head_attention(news_embeddings)
        # Stage 2: pool the contextualised news into a single user vector.
        pooled_user, pooling_weights = self.user_attention(contextual_news)

        return pooled_user, {
            'multi_head_attention': per_head_weights,
            'user_attention': pooling_weights,
        }

Test

In [None]:
import torch

# Configuration for a small UserEncoder instance.
embed_dim, num_heads = 300, 10      # embedding size / number of attention heads
batch_size, num_articles = 4, 5     # users per batch / articles browsed per user

# Random tensor simulating news representations; it is only created here,
# this cell does not run a forward pass.
user_input = torch.randn(batch_size, num_articles, embed_dim)

# Build the encoder.
user_encoder = UserEncoder(num_heads=num_heads, embed_dim=embed_dim)


Test

In [None]:
# Smoke test: run the full UserEncoder once on random input and print shapes.
batch_size, num_news = 16, 10  # num_news = articles browsed by each user
embed_dim, num_heads = 300, 10

# Uniform-random tensor standing in for the news embeddings.
news_embeddings = torch.rand(batch_size, num_news, embed_dim)

# Build the encoder and run a single forward pass.
user_encoder = UserEncoder(num_heads=num_heads, embed_dim=embed_dim)
user_representation, attention_weights = user_encoder(news_embeddings)

# Expected shapes:
#   user representation      -> (16, 300)
#   multi-head weights, head 0 -> (16, 10, 10)
#   user attention weights   -> (16, 10)
print("User representation shape:", user_representation.shape)
print("Multi-head attention weights shape (head 0):", attention_weights['multi_head_attention'][0].shape)
print("User attention weights shape:", attention_weights['user_attention'].shape)


Tests
Test the execution of every component of the user encoder.

In [None]:
def test_user_encoder_initialization(user_encoder):
    """Check that an encoder object was actually created (i.e. is not None)."""
    was_created = user_encoder is not None
    assert was_created, "UserEncoder initialization failed."
    print("UserEncoder initialization test passed.")

def test_user_encoder_forward(user_encoder, news_embeddings):
    """
    Run one forward pass through the encoder and check its outputs.

    Args:
    - user_encoder: Callable returning (user_representation, attention_weights dict).
    - news_embeddings: Tensor of shape (batch_size, num_news, embed_dim).

    The expected output shape is derived from the input instead of being
    hard-coded, so the check works for any batch size / embedding size.
    """
    user_representation, attention_weights = user_encoder(news_embeddings)
    batch_size, _, embed_dim = news_embeddings.shape
    assert user_representation.shape == (batch_size, embed_dim), "User representation shape mismatch!"
    assert 'multi_head_attention' in attention_weights, "Missing multi_head_attention key!"
    assert 'user_attention' in attention_weights, "Missing user_attention key!"
    print("UserEncoder forward pass test passed.")

def test_multi_head_attention(multi_head_attention, news_embeddings):
    """
    Check that the multi-head attention layer preserves the input shape.

    Args:
    - multi_head_attention: Callable returning (enhanced_news_embeddings, attention_weights).
    - news_embeddings: Tensor of shape (batch_size, num_news, embed_dim).

    The expected shape is taken from the input rather than hard-coded, so the
    check is valid for any configuration.
    """
    enhanced_news_embeddings, attention_weights = multi_head_attention(news_embeddings)
    assert enhanced_news_embeddings.shape == news_embeddings.shape, "Multi-head attention output shape mismatch!"
    print("MultiHeadAdditiveAttention test passed.")

def test_user_additive_attention(user_attention, news_embeddings):
    """
    Check the output shapes of the additive attention pooling layer.

    Args:
    - user_attention: Callable returning (user_representation, attention_weights).
    - news_embeddings: Tensor of shape (batch_size, num_news, embed_dim).

    Expected shapes are derived from the input rather than hard-coded:
    (batch_size, embed_dim) for the user vector and (batch_size, num_news)
    for the attention weights.
    """
    user_representation, attention_weights = user_attention(news_embeddings)
    batch_size, num_news, embed_dim = news_embeddings.shape
    assert user_representation.shape == (batch_size, embed_dim), "User representation shape mismatch!"
    assert attention_weights.shape == (batch_size, num_news), "Attention weights shape mismatch!"
    print("UserAdditiveAttention test passed.")




In [None]:
def main():
    """Build random inputs and run every test helper once, in order."""
    # Test configuration.
    batch_size, num_news = 16, 10
    embed_dim, num_heads = 64, 4

    # Random news embeddings shared by all checks below.
    news_embeddings = torch.randn(batch_size, num_news, embed_dim)
    print("News embeddings initialized:", news_embeddings.shape)

    # Full encoder: construction check followed by a forward-pass check.
    user_encoder = UserEncoder(num_heads=num_heads, embed_dim=embed_dim)
    print("UserEncoder initialized.")
    test_user_encoder_initialization(user_encoder)
    test_user_encoder_forward(user_encoder, news_embeddings)

    # Each sub-layer is also exercised on its own, with a fresh instance.
    test_multi_head_attention(
        MultiHeadAdditiveAttention(num_heads=num_heads, embed_dim=embed_dim),
        news_embeddings,
    )
    test_user_additive_attention(
        UserAdditiveAttention(embed_dim=embed_dim),
        news_embeddings,
    )


if __name__ == "__main__":
    main()

Every test above appears to run to completion without errors.

In [None]:
# Debugging Script

import torch

# Define Parameters
batch_size = 4    # Number of users in the batch
num_articles = 5  # Number of articles browsed per user
embed_dim = 300   # Embedding dimension (output from News Encoder)
num_heads = 10    # Number of attention heads

# Generate Input Data
user_input = torch.randn(batch_size, num_articles, embed_dim)
print("User Input Shape:", user_input.shape)

# Each stage below is wrapped separately so that a failure is reported with a
# stage-specific message instead of aborting the whole debugging cell.

# Initialize the User Encoder
try:
    user_encoder = UserEncoder(embed_dim=embed_dim, num_heads=num_heads)
    print("UserEncoder initialized successfully.")
except Exception as e:
    print(f"Error during UserEncoder initialization: {e}")

# Forward Pass
try:
    user_representation, attention_weights = user_encoder(user_input)
    print("Forward pass completed successfully.")
except Exception as e:
    print(f"Error during forward pass: {e}")

# Validate Output Shapes
# UserEncoder.forward returns the attention weights as a dict with the keys
# 'multi_head_attention' and 'user_attention'; the dict itself has no .shape,
# so the per-article weights are read from the 'user_attention' entry.
try:
    print("User Representation Shape:", user_representation.shape)  # Expected: (batch_size, embed_dim)
    print("Attention Weights Shape:", attention_weights['user_attention'].shape)  # Expected: (batch_size, num_articles)
except Exception as e:
    print(f"Error validating output shapes: {e}")

# Inspect Attention Weights for the First User
try:
    # Row 0 of the 'user_attention' tensor holds the first user's weights over the articles.
    print("\nAttention Weights for User 1:\n", attention_weights['user_attention'][0].detach().numpy())  # Shape: (num_articles)
except Exception as e:
    print(f"Error inspecting attention weights: {e}")
