In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import plotly.express as px
import plotly.graph_objects as go
import json
import numpy as np
import matplotlib.pyplot as plt
import os
from typing import Dict, List, Tuple
from collections import defaultdict

In [2]:
def load_model(model_name):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        # use_flash_attention_2=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

model, tokenizer = load_model("deepseek-ai/deepseek-moe-16b-base")

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [7]:
def get_top_k_tokens(hidden_states: torch.Tensor, lm_head: torch.nn.Linear, tokenizer, k: int = 5) -> List[Tuple[str, float]]:
    """ get topk tokens from hidden states using lm head """
    with torch.no_grad():
        logits = lm_head(hidden_states)  # (batch_size, sequence_length, vocab_size)
    
    scores, token_ids = torch.topk(logits, k=k, dim=-1)
    results = []
    for i in range(k):
        token = tokenizer.decode(token_ids[0, -1, i]) 
        score = scores[0, -1, i].item()
        results.append((token, score))
    return results

In [47]:
class DeepseekLayerAnalyzer:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.layer_outputs = defaultdict(list)
        self.moe_gate_outputs = defaultdict(list)
        self.moe_combined_outputs = defaultdict(list)
        self.expert_outputs = defaultdict(lambda: defaultdict(list))
        self.shared_expert_outputs = defaultdict(list)
        self.hooks = []
        
    def register_hooks(self):
        """Register hooks for layer outputs and MoE combination points"""
        
        def layer_output_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing layer outputs"""
                hidden_states = outputs[0] if isinstance(outputs, tuple) else outputs
                self.layer_outputs[layer_idx].append(hidden_states.detach())
            return hook

        def moe_gate_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing MoE gate outputs before expert computation"""
                # Capture topk_idx, topk_weight, and aux_loss from gate outputs
                if isinstance(outputs, tuple):
                    topk_idx, topk_weight, aux_loss = outputs
                    self.moe_gate_outputs[layer_idx].append({
                        'topk_idx': topk_idx.detach(),
                        'topk_weight': topk_weight.detach(),
                        'aux_loss': aux_loss.detach() if aux_loss is not None else None
                    })
            return hook

        def expert_hook(layer_idx, expert_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing individual expert outputs"""
                # Get the latest gate outputs for this layer
                if not self.moe_gate_outputs[layer_idx]:
                    return
                    
                gate_data = self.moe_gate_outputs[layer_idx][-1]
                selected_experts = gate_data['topk_idx'].unique()
                
                # Only record if this expert was actually selected
                if expert_idx in selected_experts:
                    self.expert_outputs[layer_idx][expert_idx].append({
                        'input': inputs[0].detach(),
                        'output': outputs.detach()
                    })
            return hook

        def shared_expert_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing shared expert outputs"""
                self.shared_expert_outputs[layer_idx].append({
                    'input': inputs[0].detach(),
                    'output': outputs.detach()
                })
                print(f"Shared expert output shape: {outputs.shape}")
            return hook

        def moe_combine_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing final combined MoE outputs"""
                # For DeepseekMoE, this captures the weighted sum of expert outputs
                self.moe_combined_outputs[layer_idx].append({
                    'combined_output': outputs.detach(),
                    'input': inputs[0].detach()  # Original input before MoE
                })
                print(f"Combined output shape: {outputs.shape}")
            return hook

        # Register hooks for each layer
        for layer_idx, layer in enumerate(self.model.model.layers):
            # Hook for layer output
            hook = layer.register_forward_hook(layer_output_hook(layer_idx))
            self.hooks.append(hook)
            
            # If it's an MoE layer, add MoE-specific hooks
            if hasattr(layer.mlp, 'experts'):
                # Hook for gate mechanism
                gate_hook = layer.mlp.gate.register_forward_hook(moe_gate_hook(layer_idx))
                self.hooks.append(gate_hook)
                
                # Hook for each expert
                for expert_idx, expert in enumerate(layer.mlp.experts):
                    expert_hook_fn = expert.register_forward_hook(expert_hook(layer_idx, expert_idx))
                    self.hooks.append(expert_hook_fn)

                # Hook for shared expert if it exists
                if hasattr(layer.mlp, 'shared_experts'):
                    shared_hook = layer.mlp.shared_experts.register_forward_hook(shared_expert_hook(layer_idx))
                    self.hooks.append(shared_hook)
                
                # Hook for final combined output
                combine_hook = layer.mlp.register_forward_hook(moe_combine_hook(layer_idx))
                self.hooks.append(combine_hook)

    def analyze_tokens(self, input_ids: torch.Tensor, return_hidden_states: bool = False) -> Dict:
        """ run inference and analyze tokens at each layer and expert combination point """
        # Clear previous results
        self.layer_outputs.clear()
        self.moe_gate_outputs.clear()
        self.moe_combined_outputs.clear()
        self.expert_outputs.clear()
        self.shared_expert_outputs.clear()
        
        # Forward pass
        with torch.no_grad():
            outputs = self.model(input_ids)
        
        results = {
            'layer_predictions': {},
            'moe_analysis': {},
            'hidden_states': {} if return_hidden_states else None
        }
        
        # Analyze layer outputs
        for layer_idx, outputs in self.layer_outputs.items():
            if not outputs:  # Skip if no outputs captured
                continue
            hidden_states = outputs[-1]  # Get last captured output
            
            # Get token predictions for this layer
            top_tokens = get_top_k_tokens(hidden_states, self.model.lm_head, self.tokenizer)
            results['layer_predictions'][layer_idx] = top_tokens
            
            if return_hidden_states:
                results['hidden_states'][f'layer_{layer_idx}'] = hidden_states
        
        # Analyze MoE layers
        for layer_idx in self.moe_gate_outputs.keys():
            if not self.moe_gate_outputs[layer_idx]:
                continue
                
            gate_data = self.moe_gate_outputs[layer_idx][-1]  # Get last captured data
            combined_data = self.moe_combined_outputs[layer_idx][-1]
            
            # Get predictions for each expert output
            expert_predictions = {}
            for expert_idx, data in self.expert_outputs[layer_idx].items():
                if data:  # If expert was used
                    expert_predictions[expert_idx] = get_top_k_tokens(
                        data[-1]['output'],
                        self.model.lm_head,
                        self.tokenizer
                    )
            
            # Get predictions for shared expert if it exists
            # shared_expert_predictions = None
            if self.shared_expert_outputs[layer_idx]:
                shared_expert_predictions = get_top_k_tokens(
                    self.shared_expert_outputs[layer_idx][-1]['output'],
                    self.model.lm_head,
                    self.tokenizer
                )
            
            # Analyze expert selection and combination
            experts_analysis = {
                'selected_experts': gate_data['topk_idx'].tolist(),
                'expert_weights': gate_data['topk_weight'].tolist(),
                'aux_loss': gate_data['aux_loss'].item() if gate_data['aux_loss'] is not None else None,
                'expert_predictions': expert_predictions,
                'shared_expert_predictions': shared_expert_predictions
            }
            
            # Get token predictions from combined output
            combined_tokens = get_top_k_tokens(
                combined_data['combined_output'], 
                self.model.lm_head,
                self.tokenizer
            )
            
            experts_analysis['combined_output_tokens'] = combined_tokens
            results['moe_analysis'][layer_idx] = experts_analysis

        
        return results
    
    def cleanup(self):
        """remove all registered hooks"""
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()

def analyze_deepseek_moe(model, tokenizer, input_text: str, return_hidden_states: bool = False):
    """ analyze DeepSeek MoE model behavior for given input text """
    # Initialize analyzer
    analyzer = DeepseekLayerAnalyzer(model, tokenizer)
    analyzer.register_hooks()
    
    # Prepare input
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    
    # Run analysis
    try:
        results = analyzer.analyze_tokens(input_ids, return_hidden_states=return_hidden_states)
        return results
    finally:
        analyzer.cleanup()  # Always clean up hooks


In [49]:
analysis = analyze_deepseek_moe(
    model, 
    tokenizer,
    "the quick brown fox"
)

# Access layer predictions
for layer_idx, preds in analysis['layer_predictions'].items():
    print(f"Layer {layer_idx} predictions:", preds)

# Access MoE analysis
for layer_idx, moe_data in analysis['moe_analysis'].items():
    print(f"\nMoE Layer {layer_idx}:")
    print("Selected experts:", moe_data['selected_experts'])
    print("Expert weights:", moe_data['expert_weights'])
    print("Combined output tokens:", moe_data['combined_output_tokens'])
    print("Expert predictions:", moe_data['expert_predictions'])
    print("Shared expert predictions:", moe_data['shared_expert_predictions'])

layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 0 hidden states shape (end of layer): torch.Size([1, 8, 2048])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 gate outputs shape: torch.Size([8, 6])
layer 1 expert 1 output shape: torch.Size([1, 2048])
layer 1 expert 1 ou

In [3]:
def get_moe_metadata(model, input_ids):
    """Get both router logits and expert indices for all MoE layers"""
    router_logits_list = []
    expert_indices_list = []
    hidden_states_list = []
    
    def hook_fn(module, input, output):
        # output contains: (topk_idx, topk_weight, aux_loss)
        hidden_states = input[0]
        
        logits = torch.matmul(hidden_states, module.weight.T)
        router_logits_list.append(logits.detach())
        
        # store expert indices actually used for routing
        expert_indices_list.append(output[0].detach())

        # store the hidden states
        hidden_states_list.append(hidden_states.detach())
        
        return output
    
    hooks = []
    for layer_idx, layer in enumerate(model.model.layers):
        if layer.mlp.__class__.__name__ == 'DeepseekMoE':
            hook = layer.mlp.gate.register_forward_hook(hook_fn)
            hooks.append(hook)

    with torch.no_grad():
        model(input_ids)
    
    for hook in hooks:
        hook.remove()

    moe_metadata = {
        'router_logits': torch.stack(router_logits_list) if router_logits_list else None,
        'expert_indices': torch.stack(expert_indices_list) if expert_indices_list else None,
        'hidden_states': torch.stack(hidden_states_list) if hidden_states_list else None
    }
    
    if moe_metadata['router_logits'] is not None:
        print(f"Router logits shape: {moe_metadata['router_logits'].shape}")
    if moe_metadata['expert_indices'] is not None:
        print(f"Expert indices shape: {moe_metadata['expert_indices'].shape}")
    if moe_metadata['hidden_states'] is not None:
        print(f"Hidden states shape: {moe_metadata['hidden_states'].shape}")
    
    return moe_metadata

In [4]:
def get_expert_outputs(model, moe_metadata):
    """Compute expert outputs for top-k selected experts in all MoE layers"""
    expert_outputs = []
    num_layers = 27
    
    # Get metadata dimensions
    # num_layers = moe_metadata['expert_indices'].shape[0]
    print(f'expert_indices shape: {moe_metadata["expert_indices"].shape}')
    num_tokens = moe_metadata['expert_indices'].shape[1]
    top_k = moe_metadata['expert_indices'].shape[2]
    hidden_dim = moe_metadata['hidden_states'].shape[-1]

    # Pre-allocate tensor: [layers, tokens, top_k, hidden_dim]
    all_expert_outputs = torch.zeros(
        (num_layers, num_tokens, top_k, hidden_dim),
        device=model.device
    )

    for layer_idx in range(num_layers):
        # Get MoE components for current layer
        expert_module = model.model.layers[layer_idx+1].mlp.experts
        layer_hidden_states = moe_metadata['hidden_states'][layer_idx]  # [1, num_tokens, hdim]
        layer_expert_indices = moe_metadata['expert_indices'][layer_idx]  # [num_tokens, top_k]

        for token_idx in range(num_tokens):
            # Get hidden state for this token (remove batch dim)
            hidden_state = layer_hidden_states[0, token_idx]  # [hdim]

            # Get expert indices for this token
            expert_indices = layer_expert_indices[token_idx]

            # Process through each selected expert
            for expert_pos, expert_idx in enumerate(expert_indices):
                expert = expert_module[expert_idx.item()]
                
                # Add batch dimension for processing
                with torch.no_grad():
                    expert_out = expert(hidden_state.unsqueeze(0))  # [1, hdim]
                
                all_expert_outputs[layer_idx, token_idx, expert_pos] = expert_out.squeeze(0)

    print(f"Expert outputs shape: {all_expert_outputs.shape}")
    return all_expert_outputs

In [5]:
def project_expert_outputs(model, expert_outputs):
    """
    Project expert outputs through LM head while maintaining structure
    Returns tensor of shape [num_layers, num_tokens, num_experts, vocab_size]
    """
    # Get model dtype from LM head
    model_dtype = model.lm_head.weight.dtype
    
    # Get original shape details
    num_layers, num_tokens, num_experts, hidden_dim = expert_outputs.shape
    vocab_size = model.lm_head.out_features
    print(f'vocab_size: {vocab_size}')
    # Pre-allocate output tensor using model dtype
    expert_logits = torch.zeros(
        (num_layers, num_tokens, num_experts, vocab_size),
        device=model.device,
        dtype=model_dtype  # Match model's dtype
    )

    # Process each layer, token and expert individually
    for layer_idx in range(num_layers):
        for token_idx in range(num_tokens):
            for expert_idx in range(num_experts):
                # Get expert output and cast to model dtype
                expert_output = expert_outputs[layer_idx, token_idx, expert_idx]
                expert_output = expert_output.to(model_dtype)  # <-- CRITICAL CAST
                
                # Project through LM head
                with torch.no_grad():
                    logits = model.lm_head(expert_output.unsqueeze(0))
                
                # Store result
                expert_logits[layer_idx, token_idx, expert_idx] = logits.squeeze(0)

    print(f"Expert logits shape: {expert_logits.shape}")
    return expert_logits

In [6]:
def get_expert_topk_tokens(expert_logits, tokenizer, k=5):
    """
    Get top-k tokens for each expert at each layer and token position
    Returns nested dictionary:
    {
        layer_idx: {
            token_idx: {
                expert_idx: {
                    'tokens': [decoded tokens],
                    'scores': [corresponding scores],
                    'ids': [token ids]
                }, ...
            }, ...
        }, ...
    }
    """
    num_layers, num_tokens, num_experts, _ = expert_logits.shape
    results = {}

    for layer_idx in range(num_layers):
        layer_results = {}
        for token_idx in range(num_tokens):
            token_results = {}
            for expert_idx in range(num_experts):
                # Get logits for this expert configuration
                expert_logit = expert_logits[layer_idx, token_idx, expert_idx]
                
                # Get top-k predictions
                topk_scores, topk_indices = torch.topk(expert_logit, k)
                
                # Convert to CPU/numpy for decoding
                topk_indices_cpu = topk_indices.cpu().numpy()
                topk_scores_cpu = topk_scores.cpu().numpy()
                
                # Decode tokens
                decoded_tokens = tokenizer.batch_decode(topk_indices_cpu)
                
                token_results[expert_idx] = {
                    'tokens': decoded_tokens,
                    'scores': topk_scores_cpu.tolist(),
                    'ids': topk_indices_cpu.tolist()
                }
            
            layer_results[token_idx] = token_results
        results[layer_idx] = layer_results

    return results

In [7]:
input_txt = "the quick brown fox"
input_ids = tokenizer.encode(input_txt, return_tensors="pt")
moe_metadata = get_moe_metadata(model, input_ids)
expert_outputs = get_expert_outputs(model, moe_metadata)
expert_logits = project_expert_outputs(model, expert_outputs)
expert_topk_tokens = get_expert_topk_tokens(expert_logits, tokenizer)

Router logits shape: torch.Size([27, 1, 5, 64])
Expert indices shape: torch.Size([27, 5, 6])
Hidden states shape: torch.Size([27, 1, 5, 2048])
expert_indices shape: torch.Size([27, 5, 6])
Expert outputs shape: torch.Size([27, 5, 6, 2048])
vocab_size: 102400
Expert logits shape: torch.Size([27, 5, 6, 102400])


In [8]:
print(expert_topk_tokens[26])



In [9]:
def get_shared_expert_outputs(model, input_ids):
    """Get outputs from shared experts in all MoE layers"""
    shared_outputs_list = []
    hidden_states_list = []
    moe_layers = []

    def hook_fn(module, input, output):
        # Capture hidden states entering the MoE layer
        hidden_states = input[0]
        hidden_states_list.append(hidden_states.squeeze(0).detach())
        return output

    hooks = []
    # Identify MoE layers and register hooks
    for layer in model.model.layers:
        if layer.mlp.__class__.__name__ == 'DeepseekMoE':
            moe_layers.append(layer.mlp)
            hook = layer.mlp.gate.register_forward_hook(hook_fn)
            hooks.append(hook)

    # Forward pass to collect hidden states
    with torch.no_grad():
        model(input_ids)
    
    # Remove hooks after forward pass
    for hook in hooks:
        hook.remove()

    # Compute shared expert outputs for each MoE layer
    for layer_idx, moe_layer in enumerate(moe_layers):
        hidden_states = hidden_states_list[layer_idx]
        
        # Get output from shared experts (which is a single DeepseekMLP)
        with torch.no_grad():
            expert_out = moe_layer.shared_experts(hidden_states)
        layer_shared_outputs = [expert_out]
        
        # Stack outputs: [num_shared_experts=1, seq_len, hidden_dim] 
        shared_outputs_list.append(torch.stack(layer_shared_outputs, dim=0))

    # Stack all layer outputs to get shape [num_layers, num_shared_experts=1, seq_len, hidden_dim]
    shared_outputs_tensor = torch.stack(shared_outputs_list, dim=0)

    return {
        'shared_expert_outputs': shared_outputs_tensor,
        'hidden_states': torch.stack(hidden_states_list) if hidden_states_list else None
    }

In [10]:
# Get shared expert outputs separately
shared_data = get_shared_expert_outputs(model, input_ids)

print(shared_data['shared_expert_outputs'].shape)
print(f"Shared expert outputs : {len(shared_data['shared_expert_outputs'])}")
print(f"First layer shared outputs shape: {shared_data['shared_expert_outputs'][0].shape}")

torch.Size([27, 1, 5, 2048])
Shared expert outputs : 27
First layer shared outputs shape: torch.Size([1, 5, 2048])


In [11]:
expert_logits = project_expert_outputs(model, expert_outputs=shared_data['shared_expert_outputs'])
expert_topk_tokens = get_expert_topk_tokens(expert_logits, tokenizer)

vocab_size: 102400


Expert logits shape: torch.Size([27, 1, 5, 102400])


In [12]:
expert_topk_tokens[19][0][4]

{'tokens': ['ა', ' <!--[', 'ა�', 'ELY', '\tandroid'],
 'scores': [1.1953125, 1.1025390625, 1.1005859375, 1.0888671875, 1.0205078125],
 'ids': [46554, 69586, 56166, 70939, 97199]}

In [13]:
def map_to_actual_experts(expert_topk_tokens, moe_metadata):
    """
    Convert top-k positional indices to actual expert indices using routing data
    Returns:
    {
        layer_idx: {
            token_idx: {
                actual_expert_idx: {
                    'tokens': [...],
                    'scores': [...], 
                    'ids': [...]
                }, ...
            }, ...
        }, ...
    }
    """
    actual_expert_dict = {}
    expert_indices = moe_metadata['expert_indices'].cpu().numpy()
    
    for layer_idx in expert_topk_tokens:
        layer_data = expert_topk_tokens[layer_idx]
        actual_layer = {}
        
        # Get actual expert IDs for this layer [num_tokens, top_k]
        layer_expert_ids = expert_indices[layer_idx]
        
        for token_idx in layer_data:
            token_data = layer_data[token_idx]
            actual_token = {}
            
            # Get actual expert IDs for this token [top_k]
            token_expert_ids = layer_expert_ids[token_idx]
            
            for pos_idx in token_data:
                # Map positional index to actual expert ID
                actual_expert_id = int(token_expert_ids[pos_idx])
                actual_token[actual_expert_id] = token_data[pos_idx]
                
            actual_token = dict(sorted(actual_token.items()))
            actual_layer[token_idx] = actual_token
            
        actual_expert_dict[layer_idx] = actual_layer
        
    return actual_expert_dict

In [14]:
actual_expert_tokens = map_to_actual_experts(expert_topk_tokens, moe_metadata)
actual_expert_tokens

{0: {0: {2: {'tokens': ['始终坚持', 'JT', 'LH', '犀', '固执'],
    'scores': [0.290283203125,
     0.287841796875,
     0.285888671875,
     0.28515625,
     0.283203125],
    'ids': [99612, 77201, 96732, 65146, 97881]},
   14: {'tokens': ['钰', 'AGA', 'pei', 'LH', '始终坚持'],
    'scores': [0.31298828125,
     0.2880859375,
     0.281982421875,
     0.276611328125,
     0.275146484375],
    'ids': [94398, 92651, 68396, 96732, 99612]},
   34: {'tokens': ['被迫', 'ISAM', '竣', '乞', 'INSEE'],
    'scores': [0.491455078125,
     0.46728515625,
     0.46484375,
     0.443115234375,
     0.44189453125],
    'ids': [64397, 57610, 69003, 61028, 18832]},
   42: {'tokens': ['裹', '野心', '沾', '琦', '姗'],
    'scores': [0.327392578125,
     0.307373046875,
     0.294921875,
     0.293212890625,
     0.291748046875],
    'ids': [33403, 90530, 40540, 55105, 90230]},
   47: {'tokens': ['HCI', ' dalt', '沾', 'JT', '*:'],
    'scores': [0.317138671875,
     0.314697265625,
     0.300048828125,
     0.298095703125,
    

In [15]:
print(actual_expert_tokens[26][0][2])  # Layer 26, Token 1, Expert 64

KeyError: 2

In [23]:
input_txt = "the quick brown fox"
input_ids = tokenizer.encode(input_txt, return_tensors="pt")

x = model(input_ids, output_hidden_states=True)


torch.Size([1, 5, 2048])


tensor([[ 3.6865e-02,  2.1484e-02, -3.3398e-01,  ...,  1.7578e-02,
         -1.9434e-01,  7.4768e-03],
        [-3.8574e-02, -4.8340e-02, -2.0605e-01,  ...,  2.0996e-02,
         -1.5736e-04, -3.6865e-02],
        [ 4.2969e-02,  6.4453e-02, -1.1768e-01,  ..., -2.8442e-02,
          1.0645e-01, -1.2207e-03],
        [ 8.9844e-02, -4.0771e-02,  7.4219e-02,  ...,  2.0630e-02,
          1.7383e-01,  8.4839e-03],
        [ 2.0020e-02, -1.0107e-01,  6.6895e-02,  ..., -1.3867e-01,
          5.5176e-02,  8.9355e-02]], dtype=torch.float16,
       grad_fn=<SelectBackward0>)

In [27]:
print(x['hidden_states'][15].shape)
y = x['hidden_states'][15][0][-1]
tokens = model.lm_head(y)

tokenizer.decode(tokens.argmax(dim=-1))

for layer in range(27):

    y = x['hidden_states'][layer][0][-1]
    tokens = model.lm_head(y)
    print(tokenizer.decode(tokens.argmax(dim=-1)))

torch.Size([1, 5, 2048])
oment
">:
es
es
croft
croft
IEEEeqnarray
IEEEeqnarray
IEEEeqnarray
IEEEeqnarray
IEEEeqnarray
IEEEeqnarray
IEEEeqnarray
IEEEeqnarray
issin
IEEEeqnarray
estrat
IEEEeqnarray
IEEEeqnarray
="../_
 rejo
 r
Jump
Jump
Jump
Jump
Jump


In [28]:
# Configuration for activation collection
layer_to_hook = 5  # Change this to your desired layer number (0-based index)
mlp_activations = []

# Define hook function
def mlp_hook(module, module_input, module_output):
    """Store MLP output activations after each forward pass"""
    mlp_activations.append(module_output.detach().cpu())

# Register hook on the specified layer
try:
    target_layer = model.model.layers[layer_to_hook].mlp
    handle = target_layer.register_forward_hook(mlp_hook)
except (AttributeError, IndexError) as e:
    raise ValueError(f"Invalid layer number: {layer_to_hook}") from e

# Run generation
text = "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)

# Remove hook after use
handle.remove()

# Decode and print results
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:\n", result)

# Print activation information
print(f"\nCollected {len(mlp_activations)} MLP activations from layer {layer_to_hook}")
if len(mlp_activations) > 0:
    print(f"Activation shape: {mlp_activations[0].shape}")

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


Generated text:
 An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is a vector of attention scores, which are used to compute a weighted sum of the values. The attention function is used in many applications, including machine translation, question answering, and image captioning.

In this post, we will discuss the attention function in detail. We will start by defining the attention function and then discuss its properties. We will also discuss how the attention function is used in various applications.

## What is Attention Function?

The attention function is a mathematical function that

Collected 100 MLP activations from layer 5
Activation shape: torch.Size([1, 40, 2048])


In [3]:
text = 'the quick brown fox'
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)