In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import os
from typing import Dict, List, Tuple
from collections import defaultdict

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

if DEVICE.type == "cuda":
    # Print CUDA details
    print(f"CUDA Device: {torch.cuda.get_device_name()}")
    print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated()/1024**2:.2f}MB")
    print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved()/1024**2:.2f}MB")

Using device: cpu


In [3]:
def load_model(model_name):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        # use_flash_attention_2=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

model, tokenizer = load_model("deepseek-ai/deepseek-moe-16b-base")

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [4]:
def get_top_k_tokens(hidden_states: torch.Tensor, lm_head: torch.nn.Linear, tokenizer, k: int = 5) -> List[Tuple[str, float]]:
    """ get topk tokens from hidden states using lm head """
    with torch.no_grad():
        # Move tensors to global device
        hidden_states = hidden_states.to(DEVICE)
        lm_head = lm_head.to(DEVICE)
        
        # Ensure hidden_states has at least 2 dimensions (batch_size, num_tokens, hidden_dim)
        if hidden_states.dim() == 2:
            hidden_states = hidden_states.unsqueeze(0)  # Add batch dimension
            
        # Compute logits in chunks to avoid OOM
        chunk_size = 512  # Adjust based on GPU memory
        num_chunks = (hidden_states.size(1) + chunk_size - 1) // chunk_size
        all_logits = []
        
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, hidden_states.size(1))
            chunk = hidden_states[:, start_idx:end_idx]
            chunk_logits = lm_head(chunk)
            all_logits.append(chunk_logits)
            
        logits = torch.cat(all_logits, dim=1)  # (batch_size, num_tokens, vocab_size)
    
    # Get top-k tokens
    scores, token_ids = torch.topk(logits, k=k, dim=-1)  # (batch_size, num_tokens, k)
    
    # Move to CPU for decoding
    scores = scores.cpu()
    token_ids = token_ids.cpu()
    
    # Decode tokens and collect results for each position
    results = []
    for pos in range(scores.size(1)):  # Iterate over token positions
        pos_results = []
        for i in range(k):
            token = tokenizer.decode(token_ids[0, pos, i])  # Decode token for this position
            score = scores[0, pos, i].item()  # Get score for this position
            pos_results.append((token, score))
        results.append(pos_results)
    
    # Clear CUDA cache if using GPU
    if DEVICE.type == "cuda":
        torch.cuda.empty_cache()
    
    return results

In [7]:
class DeepseekLayerAnalyzer:
    """ Analyzes the behavior of a DeepSeek MoE model by capturing and analyzing outputs from different layers and experts.
    
    Args:
        model: The DeepSeek MoE model to analyze
        tokenizer: The tokenizer associated with the model
        device: Device to run analysis on ('cuda' or 'cpu')
        
    Attributes:
        layer_outputs (defaultdict): Stores outputs from each model layer
        moe_gate_outputs (defaultdict): Stores gate outputs from MoE layers
        moe_combined_outputs (defaultdict): Stores combined outputs after expert computation
        expert_outputs (defaultdict): Stores individual expert outputs per layer and position
        shared_expert_outputs (defaultdict): Stores outputs from shared experts if present
        hooks (list): List of registered PyTorch hooks
    """
    
    def __init__(self, model, tokenizer, device='cpu'):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = DEVICE
        self.hooks = []
        self.reset_state()
        
        # Check GPU memory if using CUDA
        if device == 'cuda' and torch.cuda.is_available():
            self.gpu_mem = get_device_properties(0).total_memory
            print(f"GPU Memory Available: {self.gpu_mem / 1e9:.2f} GB")
        
    def reset_state(self):
        """Clear all stored state between runs and free GPU memory"""
        self.layer_outputs = defaultdict(list)
        self.moe_gate_outputs = defaultdict(list)
        self.moe_combined_outputs = defaultdict(list)
        self.expert_outputs = defaultdict(lambda: defaultdict(list))
        self.shared_expert_outputs = defaultdict(list)
        
        # Clear CUDA cache if using GPU
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()

    def register_hooks(self):
        """Register hooks for layer outputs and MoE combination points"""
        
        def layer_output_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing layer outputs"""
                hidden_states = outputs[0] if isinstance(outputs, tuple) else outputs
                # Store on CPU to save GPU memory
                self.layer_outputs[layer_idx].append(hidden_states.detach().cpu())
            return hook

        def moe_gate_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing MoE gate outputs before expert computation"""
                # Capture topk_idx, topk_weight, and aux_loss from gate outputs
                if isinstance(outputs, tuple):
                    topk_idx, topk_weight, aux_loss = outputs
                    self.moe_gate_outputs[layer_idx].append({
                        'topk_idx': topk_idx.detach().cpu(),
                        'topk_weight': topk_weight.detach().cpu(),
                        'aux_loss': aux_loss.detach().cpu() if aux_loss is not None else None
                    })
            return hook

        def expert_hook(layer_idx, expert_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing expert outputs"""
                # Get the latest gate outputs for this layer
                if not self.moe_gate_outputs[layer_idx]:
                    return print(f'no gate outputs for layer {layer_idx}')
            
                gate_data = self.moe_gate_outputs[layer_idx][-1]
                
                # Handle 2D or 3D tensor shapes
                if len(gate_data['topk_idx'].shape) == 2:
                    batch_size = 1
                    seq_len, top_k = gate_data['topk_idx'].shape
                else:
                    batch_size, seq_len, top_k = gate_data['topk_idx'].shape
                
                # Get mask for tokens where this expert was selected
                expert_mask = (gate_data['topk_idx'] == expert_idx)                
                # Flatten and find positions where this expert was selected
                selected_positions = torch.nonzero(expert_mask, as_tuple=True)
                # If no tokens selected this expert, skip
                if selected_positions[0].numel() == 0:
                    return
                    
                # Get the actual inputs routed to this expert
                # Inputs[0] shape: (total_selected_tokens, hidden_dim)
                total_selected = inputs[0].shape[0] 
                # Validate we're processing the correct number of tokens
                expected_selected = expert_mask.sum().item()
                if total_selected != expected_selected:
                    print(f" expert {expert_idx} processed {total_selected} tokens but expected {expected_selected}")
                    return
                    
                # Get the full hidden states from outputs
                # outputs shape: (total_selected_tokens, hidden_dim)
                hidden_states = outputs
                if isinstance(outputs, tuple):
                    hidden_states = outputs[0]
                    
                # Record data for each selected position
                for pos_idx, pos in enumerate(selected_positions[0]):
                    token_data = {
                        'position': pos.item(),
                        'input': inputs[0][pos_idx].detach().cpu(),
                        'output': outputs[pos_idx].detach().cpu(),
                        'hidden_state': hidden_states[pos_idx].detach().cpu()  # Store full hidden state
                    }
                    
                    # Get the corresponding gate weight for this position
                    # Find which expert slot (in top_k) this expert was selected for this position
                    expert_slots = (gate_data['topk_idx'][pos.item()] == expert_idx).nonzero(as_tuple=True)[0]
                    if len(expert_slots) > 0:
                        token_data['gate_weight'] = gate_data['topk_weight'][pos.item()][expert_slots[0]].item()
                    
                    self.expert_outputs[layer_idx][expert_idx].append(token_data)
            return hook
        
        def shared_expert_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing shared expert outputs"""
                self.shared_expert_outputs[layer_idx].append({
                    'input': inputs[0].detach().cpu(),
                    'output': outputs.detach().cpu()
                })
            return hook

        def moe_combine_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing final combined MoE outputs"""
                # For DeepseekMoE, this captures the weighted sum of expert outputs
                self.moe_combined_outputs[layer_idx].append({
                    'combined_output': outputs.detach().cpu(),
                    'input': inputs[0].detach().cpu()  # Original input before MoE
                })
            return hook

        # Register hooks for each layer
        for layer_idx, layer in enumerate(self.model.model.layers):
            # Hook for layer output
            hook = layer.register_forward_hook(layer_output_hook(layer_idx))
            self.hooks.append(hook)
            
            # If it's an MoE layer, add MoE-specific hooks
            if hasattr(layer.mlp, 'experts'):
                # Hook for gate mechanism
                gate_hook = layer.mlp.gate.register_forward_hook(moe_gate_hook(layer_idx))
                self.hooks.append(gate_hook)
                
                # Hook for each expert
                for expert_idx, expert in enumerate(layer.mlp.experts):
                    expert_hook_fn = expert.register_forward_hook(expert_hook(layer_idx, expert_idx))
                    self.hooks.append(expert_hook_fn)

                # Hook for shared expert if it exists
                if hasattr(layer.mlp, 'shared_experts'):
                    shared_hook = layer.mlp.shared_experts.register_forward_hook(shared_expert_hook(layer_idx))
                    self.hooks.append(shared_hook)
                
                # Hook for final combined output
                combine_hook = layer.mlp.register_forward_hook(moe_combine_hook(layer_idx))
                self.hooks.append(combine_hook)

    def analyze_tokens(self, input_ids: torch.Tensor, return_hidden_states: bool = False) -> Dict:
        """Run inference and analyze tokens at each layer and expert combination point"""
        # Clear all state before processing new input
        self.reset_state()
        
        # Move input to device
        input_ids = input_ids.to(self.device)
        
        # Forward pass
        with torch.no_grad():
            outputs = self.model(input_ids)
        
        results = {
            'layer_predictions': {},
            'moe_analysis': {},
            'hidden_states': {} if return_hidden_states else None
        }
        
        # Analyze layer outputs
        for layer_idx, outputs in self.layer_outputs.items():
            if not outputs:  # Skip if no outputs captured
                continue
            hidden_states = outputs[-1].to(self.device)  # Move back to device for processing
            
            # Get token predictions for this layer
            top_tokens = get_top_k_tokens(hidden_states, self.model.lm_head, self.tokenizer)
            results['layer_predictions'][layer_idx] = top_tokens
            
            if return_hidden_states:
                results['hidden_states'][f'layer_{layer_idx}'] = hidden_states.cpu()  # Store on CPU
        
        # Analyze MoE layers
        for layer_idx in self.moe_gate_outputs.keys():
            if not self.moe_gate_outputs[layer_idx]:
                continue
                
            gate_data = self.moe_gate_outputs[layer_idx][-1]  # Get last captured data
            combined_data = self.moe_combined_outputs[layer_idx][-1]
            
            # Initialize predictions dictionary by position
            expert_predictions_by_pos = defaultdict(dict)
            expert_hidden_states_by_pos = defaultdict(dict)
            
            # Process expert outputs by position
            for expert_idx, data_list in self.expert_outputs[layer_idx].items():
                for data in data_list:
                    position = data['position']
                    # Move data to device for prediction
                    expert_output = data['output'].to(self.device).unsqueeze(0)
                    # Get predictions for this expert's output at this position
                    predictions = get_top_k_tokens(
                        expert_output,
                        self.model.lm_head,
                        self.tokenizer
                    )
                    expert_predictions_by_pos[position][expert_idx] = predictions[0]  # [0] because we only have one prediction set

                    # Store hidden states
                    expert_hidden_states_by_pos[position][expert_idx] = {
                        'hidden_state': data['hidden_state'].tolist(),
                        'gate_weight': data.get('gate_weight', None)
                    }
            
            # Get predictions for shared expert if it exists
            if self.shared_expert_outputs[layer_idx]:
                shared_output = self.shared_expert_outputs[layer_idx][-1]['output'].to(self.device)
                shared_expert_predictions = get_top_k_tokens(
                    shared_output,
                    self.model.lm_head,
                    self.tokenizer
                )
            
            # Update experts_analysis dictionary to include hidden states
            experts_analysis = {
                'selected_experts': gate_data['topk_idx'].tolist(),
                'expert_weights': gate_data['topk_weight'].tolist(),
                'aux_loss': gate_data['aux_loss'].item() if gate_data['aux_loss'] is not None else None,
                'expert_predictions_by_position': dict(expert_predictions_by_pos),
                'expert_hidden_states_by_position': dict(expert_hidden_states_by_pos),
                'shared_expert_predictions': shared_expert_predictions if self.shared_expert_outputs[layer_idx] else None
            }
            
            # Get token predictions from combined output
            combined_output = combined_data['combined_output'].to(self.device)
            combined_tokens = get_top_k_tokens(
                combined_output, 
                self.model.lm_head,
                self.tokenizer
            )
            
            experts_analysis['combined_output_tokens'] = combined_tokens
            results['moe_analysis'][layer_idx] = experts_analysis

        return results
    
    def cleanup(self):
        """Remove all registered hooks and clear state"""
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()
        self.reset_state()

def analyze_deepseek_moe(model, tokenizer, input_text: str, return_hidden_states: bool = False, device='cpu'):
    """Analyze DeepSeek MoE model behavior for given input text"""
    # Create a fresh analyzer instance for each analysis
    analyzer = DeepseekLayerAnalyzer(model, tokenizer, device=device)
    analyzer.register_hooks()
    
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    
    try:
        results = analyzer.analyze_tokens(input_ids, return_hidden_states=return_hidden_states)
        return results
    finally:
        analyzer.cleanup()  # Ensure hooks are removed and state is cleared


In [8]:
# Get prompt and tokenize
prompt = "quick brown fox"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
n = input_ids.shape[1]
token_pos = n-1  # Last token

analysis = analyze_deepseek_moe(model, tokenizer, prompt, return_hidden_states=True)

# Create analyzer and get hidden states
analyzer = DeepseekLayerAnalyzer(model, tokenizer)
analyzer.register_hooks()


results = analyzer.analyze_tokens(input_ids, return_hidden_states=True)

In [9]:
def get_post_attn_ln_inputs(model, tokenizer, text, device=DEVICE):
    """places a hook on the post attention layernorm to retrieve its inputs"""
    # Determine device
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # Store inputs in a dict mapping layer idx -> inputs
    post_attn_ln_inputs = {}
    hooks = []
    
    def hook_post_attn_ln(module, input, output, layer_idx):
        if layer_idx not in post_attn_ln_inputs:
            post_attn_ln_inputs[layer_idx] = []
        # Detach and move to CPU to save memory if using CUDA
        if device == 'cuda':
            post_attn_ln_inputs[layer_idx].append([x.detach().cpu() for x in input])
        else:
            post_attn_ln_inputs[layer_idx].append([x.detach() for x in input])
    
    # Register hooks on post attention layernorm for each layer
    for i, layer in enumerate(model.model.layers):
        hooks.append(
            layer.post_attention_layernorm.register_forward_hook(
                lambda m, i, o, idx=i: hook_post_attn_ln(m, i, o, idx)
            )
        )
    
    try:
        # Run inference
        input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
        if device == 'cuda':
            with torch.cuda.amp.autocast():  # Use mixed precision for CUDA
                model(input_ids)
            torch.cuda.empty_cache()  # Clear CUDA cache
        else:
            model(input_ids)
        
        return post_attn_ln_inputs
        
    finally:
        # Clean up hooks
        for hook in hooks:
            hook.remove()

In [10]:
post_attn_ln_inputs = get_post_attn_ln_inputs(model, tokenizer, text=prompt)

In [11]:
def get_expert_hidden_states_by_weight(analysis, post_attn_ln_inputs, token_pos, k=0):
    """Gets hidden states (with residual added) from the k-th highest weighted expert for all MoE layers"""
    hidden_states_by_layer = {}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # For each layer that has MoE
    for layer_idx in analysis['moe_analysis'].keys():
        moe_analysis = analysis['moe_analysis'][layer_idx]
        
        # Get expert weights for this token
        expert_weights = {}
        selected_experts = moe_analysis['selected_experts'][token_pos]
        expert_weights_list = moe_analysis['expert_weights'][token_pos]
        
        # Map experts to their weights
        for expert_idx, weight in zip(selected_experts, expert_weights_list):
            expert_weights[expert_idx] = weight
            
        # Sort by weight and get k-th highest
        sorted_experts = sorted(expert_weights.items(), key=lambda x: x[1], reverse=True)
        if k >= len(sorted_experts):
            print(f"Warning: Layer {layer_idx} only has {len(sorted_experts)} experts, but k={k} requested")
            continue
            
        target_expert = sorted_experts[k][0]  # Get the k-th expert's id
        
        # Get hidden state for this expert
        expert_hidden_state = moe_analysis['expert_hidden_states_by_position'][token_pos][target_expert]['hidden_state']
        
        # Get residual stream for this token
        try:
            residual = post_attn_ln_inputs[layer_idx][-1][0][0][token_pos]  # Added [0] index
        except (KeyError, IndexError) as e:
            print(f"Warning: Could not get residual for layer {layer_idx}, skipping. Error: {e}")
            continue
            
        # Convert to tensor if needed and move to appropriate device
        if isinstance(expert_hidden_state, list):
            expert_hidden_state = torch.tensor(expert_hidden_state, dtype=torch.float16, device=device)
        elif isinstance(expert_hidden_state, torch.Tensor):
            expert_hidden_state = expert_hidden_state.to(device=device, dtype=torch.float16)
            
        # Move residual to same device and dtype
        residual = residual.to(device=device, dtype=torch.float16)
            
        # Add residual using mixed precision if on CUDA
        if device == 'cuda':
            with torch.cuda.amp.autocast():
                combined = residual + expert_hidden_state
        else:
            combined = residual + expert_hidden_state
            
        hidden_states_by_layer[layer_idx] = combined
        
    return hidden_states_by_layer

In [12]:
# Get final layer index and hidden state
final_layer_idx = max([int(k) for k in results["layer_predictions"].keys()])
final_hidden_state = analyzer.layer_outputs[final_layer_idx][-1][0][token_pos]

print("Shape of final hidden state :", final_hidden_state)

Shape of final hidden state : tensor([-1.8672, -1.1270,  0.4009,  ..., -1.5059,  0.0342, -2.5078],
       dtype=torch.float16)


In [13]:

n = len(tokenizer.encode(prompt))
token_pos = n-1  # Last token
k = 0  # Get highest weighted expert

hidden_states = get_expert_hidden_states_by_weight(analysis,
                                                   post_attn_ln_inputs=post_attn_ln_inputs,
                                                   token_pos=token_pos,
                                                   k=k)

print(f'hidden_states : {hidden_states}')

hidden_states : {1: tensor([ 0.0326, -0.1321,  0.2737,  ..., -0.1681,  0.0611,  0.0240],
       dtype=torch.float16), 2: tensor([-0.1166, -0.3586,  0.1572,  ..., -0.1925,  0.0962,  0.0130],
       dtype=torch.float16), 3: tensor([-0.1203, -0.3545,  0.1521,  ..., -0.0521,  0.3518, -0.3347],
       dtype=torch.float16), 4: tensor([-0.0322, -0.5200,  0.1499,  ..., -0.0738,  0.2411, -0.0245],
       dtype=torch.float16), 5: tensor([-0.0688, -0.5557,  0.0140,  ..., -0.0708,  0.3848, -0.2874],
       dtype=torch.float16), 6: tensor([-0.1906, -0.4470, -0.2214,  ..., -0.1390,  0.3452, -0.4048],
       dtype=torch.float16), 7: tensor([-0.3516, -0.4438, -0.2330,  ..., -0.0925,  0.4446, -0.5254],
       dtype=torch.float16), 8: tensor([-0.3779, -0.4673, -0.3933,  ...,  0.1520,  0.3298, -0.2668],
       dtype=torch.float16), 9: tensor([-0.5723, -0.3511, -0.4016,  ...,  0.2032,  0.2272, -0.1050],
       dtype=torch.float16), 10: tensor([-0.6016, -0.9678, -0.5117,  ...,  0.5195, -0.2944, -0.2246],
 

In [14]:
import torch.nn.functional as F

# cosine similarity for each layer
sim = {}
for layer_idx, layer_hidden in hidden_states.items():
    # Normalize both vectors for cosine similarity
    similarity = F.cosine_similarity(final_hidden_state.unsqueeze(0),
                                   layer_hidden.unsqueeze(0))
    sim[layer_idx] = similarity.item()

# Print results sorted by layer
for layer_idx in sorted(sim.keys()):
    print(f"Layer {layer_idx} similarity: {sim[layer_idx]:.4f}")

Layer 1 similarity: 0.0709
Layer 2 similarity: 0.0688
Layer 3 similarity: 0.0408
Layer 4 similarity: 0.0459
Layer 5 similarity: 0.0466
Layer 6 similarity: 0.0602
Layer 7 similarity: 0.0601
Layer 8 similarity: 0.0609
Layer 9 similarity: 0.0849
Layer 10 similarity: 0.1040
Layer 11 similarity: 0.1093
Layer 12 similarity: 0.1161
Layer 13 similarity: 0.1243
Layer 14 similarity: 0.1267
Layer 15 similarity: 0.1420
Layer 16 similarity: 0.1436
Layer 17 similarity: 0.1578
Layer 18 similarity: 0.1534
Layer 19 similarity: 0.1505
Layer 20 similarity: 0.1613
Layer 21 similarity: 0.1775
Layer 22 similarity: 0.1823
Layer 23 similarity: 0.1899
Layer 24 similarity: 0.2017
Layer 25 similarity: 0.2255
Layer 26 similarity: 0.3474
Layer 27 similarity: 0.7798


In [15]:
import plotly.graph_objects as go

def plot_cosine_similarities(similarities, k):
    """
    Create an interactive line plot of cosine similarities across layers.
    
    Args:
    similarities: dict with layer_idx -> cosine_similarity_value
    k: which expert (by weight rank) was used
    """
    # Set plot size
    plot_size = 500  # Size in pixels for both width and height
    
    # Sort layers for x-axis
    layers = sorted(similarities.keys())
    sim_values = [similarities[layer] for layer in layers]
    
    # Create figure
    fig = go.Figure()
    
    # Add line plot
    fig.add_trace(go.Scatter(
        x=layers,
        y=sim_values,
        mode='lines+markers',
        name=f'Expert rank {k}',
        hovertemplate=
        "<b>Layer %{x}</b><br>" +
        "Cosine Similarity: %{y:.4f}<br>" +
        "<extra></extra>",  # Removes secondary box
        line=dict(width=2),
        marker=dict(size=8)
    ))
    
    # Update layout
    fig.update_layout(
        title=dict(
            text=f'Cosine Similarity with Final Hidden State<br><sup>Using {k+1}th highest weighted expert per layer</sup>',
            x=0.5,  # Center title
            y=0.95
        ),
        xaxis=dict(
            title='Layer',
            gridcolor='rgba(128,128,128,0.2)',
            tickmode='linear',
            dtick=1,  # Show every layer number
            range=[0, 27.5]  # Set x-axis range from 0 to 27
        ),
        yaxis=dict(
            title='Cosine Similarity',
            gridcolor='rgba(128,128,128,0.2)',
            range=[-0.2, 1]  # Updated range for cosine similarity
        ),
        plot_bgcolor='white',
        hovermode='x unified',  # Shows all points at a given x-coordinate
        showlegend=False,
        width=plot_size,
        height=plot_size
    )
    
    # Add grid
    fig.update_xaxes(showgrid=True, gridwidth=1)
    fig.update_yaxes(showgrid=True, gridwidth=1)
    
    return fig

# Use the function
fig = plot_cosine_similarities(sim, k)
fig.show()

In [16]:
import torch
import torch.nn.functional as F
import json
from typing import Dict, List
import numpy as np
from tqdm import tqdm
import pandas as pd

def analyze_multiple_samples(model, tokenizer, domain: str, samples: List[str]):
    """Analyze multiple samples and return average cosine similarities per layer"""
    all_similarities = []
    
    for i, sample in enumerate(tqdm(samples, desc="Analyzing samples")):
        # Get prompt and tokenize
        input_ids = tokenizer(sample, return_tensors="pt").input_ids
        n = input_ids.shape[1]
        token_pos = n-1  # Last token
        
        # Run analysis
        analysis = analyze_deepseek_moe(model, tokenizer, sample, return_hidden_states=True)
        
        # Get post attention layer norm inputs
        post_attn_ln_inputs = get_post_attn_ln_inputs(model, tokenizer, text=sample)
        
        # Get final layer index and hidden state
        final_layer_idx = max([int(k) for k in results["layer_predictions"].keys()])
        final_hidden_state = analyzer.layer_outputs[final_layer_idx][-1][0][token_pos]
        
        # Get hidden states for highest weighted expert (k=0)
        hidden_states = get_expert_hidden_states_by_weight(
            analysis,
            post_attn_ln_inputs=post_attn_ln_inputs,
            token_pos=token_pos,
            k=0
        )
        
        # Calculate cosine similarities for this sample
        similarities = {}
        for layer_idx, layer_hidden in hidden_states.items():
            similarity = F.cosine_similarity(
                final_hidden_state.unsqueeze(0),
                layer_hidden.unsqueeze(0)
            )
            similarities[layer_idx] = similarity.item()
            
        all_similarities.append(similarities)
        
        # Save individual sample results to CSV
        df = pd.DataFrame({
            'layer': list(similarities.keys()),
            'cosine_similarity': list(similarities.values())
        })
        df.to_csv(f'cosine-sim/csv/{domain}_{i+1}_similarities.csv', index=False)
    
    # Calculate average similarities per layer
    avg_similarities = {}
    for layer_idx in all_similarities[0].keys():  # Use first sample's layers as reference
        layer_values = [sim[layer_idx] for sim in all_similarities]
        avg_similarities[layer_idx] = np.mean(layer_values)
    
    # Save average similarities to CSV
    avg_df = pd.DataFrame({
        'layer': list(avg_similarities.keys()),
        f'average_cosine_similarity_{domain}': list(avg_similarities.values())
    })
    avg_df.to_csv(f'cosine-sim/csv/average_similarities_{domain}.csv', index=False)
        
    return avg_similarities, all_similarities

async def process_samples(model, tokenizer, domain="test"):
    # Read and parse JSON file
    with open(f'interp-data/{domain}.json', 'r', encoding='utf-8') as f:
        samples_data = json.load(f)
    
    # Get samples for specified domain
    samples = samples_data.get(domain, [])
    if not samples:
        raise ValueError(f"No samples found for domain: {domain}")
        
    # Run analysis
    avg_sims, all_sims = analyze_multiple_samples(model, tokenizer,domain, samples)
    
    # Print average similarities
    print("\nAverage Cosine Similarities per Layer:")
    for layer_idx in sorted(avg_sims.keys()):
        print(f"Layer {layer_idx}: {avg_sims[layer_idx]:.4f}")
        
    return avg_sims, all_sims

# Optional: Add visualization for individual samples and average
def plot_multi_sample_similarities(all_similarities, avg_similarities):
    """
    Create an interactive plot showing individual sample similarities and their average
    """
    fig = go.Figure()
    
    # Plot individual samples
    layers = sorted(avg_similarities.keys())
    for i, sample_sims in enumerate(all_similarities):
        sim_values = [sample_sims[layer] for layer in layers]
        fig.add_trace(go.Scatter(
            x=layers,
            y=sim_values,
            mode='lines',
            name=f'Sample {i+1}',
            opacity=0.3,
            hoverinfo='skip'  # Disable hover for individual samples
        ))
    
    # Plot average
    avg_values = [avg_similarities[layer] for layer in layers]
    fig.add_trace(go.Scatter(
        x=layers,
        y=avg_values,
        mode='lines+markers',
        name='Average',
        line=dict(width=3, color='black'),
        marker=dict(size=8),
        hovertemplate="<b>Layer %{x}</b><br>Average Similarity: %{y:.4f}<extra></extra>"
    ))
    
    fig.update_layout(
        title='Cosine Similarities Across Samples',
        xaxis_title='Layer',
        yaxis_title='Cosine Similarity',
        yaxis_range=[-0.2, 1],
        showlegend=True
    )
    
    return fig

In [18]:
# Initialize model and tokenizer as before
analyzer = DeepseekLayerAnalyzer(model, tokenizer)
analyzer.register_hooks()
domain = "test"

# Process samples
avg_sims, all_sims = await process_samples(model, 
                                           tokenizer,
                                           domain=domain)

Analyzing samples: 100%|██████████| 3/3 [00:23<00:00,  7.84s/it]


Average Cosine Similarities per Layer:
Layer 1: 0.0258
Layer 2: 0.0366
Layer 3: 0.0311
Layer 4: 0.0374
Layer 5: 0.0284
Layer 6: 0.0334
Layer 7: 0.0396
Layer 8: 0.0328
Layer 9: 0.0424
Layer 10: 0.0462
Layer 11: 0.0485
Layer 12: 0.0497
Layer 13: 0.0527
Layer 14: 0.0572
Layer 15: 0.0649
Layer 16: 0.0656
Layer 17: 0.0932
Layer 18: 0.1040
Layer 19: 0.1208
Layer 20: 0.1245
Layer 21: 0.1434
Layer 22: 0.1436
Layer 23: 0.1643
Layer 24: 0.1843
Layer 25: 0.2209
Layer 26: 0.2225
Layer 27: 0.7700





In [19]:
fig = plot_multi_sample_similarities(all_sims, avg_sims)
fig.write_image(f"cosine-sim/cosine-sim-plot-{domain}.png")
fig.write_html(f"cosine-sim/cosine-sim-plot-{domain}.html")
fig.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
