In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn.functional as F
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import os
from typing import Dict, List, Tuple
from collections import defaultdict
import gc
import glob
import json
from tqdm.auto import tqdm
from collections import defaultdict
import pandas as pd

In [None]:
# Notebook-wide compute device: the GPU when CUDA is usable, otherwise the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# GPU name and current allocator usage are only reported when running on CUDA
if DEVICE.type == "cuda":
    print(f"CUDA Device: {torch.cuda.get_device_name()}")
    print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated()/1024**2:.2f}MB")
    print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved()/1024**2:.2f}MB")

In [None]:
def load_model(model_name):
    """Load a causal language model in float16 together with its tokenizer.

    Args:
        model_name: Identifier handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NOTE(review): trust_remote_code=True lets the checkpoint's own Python code run
    # while loading -- only point this at repositories you trust.
    # (use_flash_attention_2=True is intentionally left disabled.)
    model_kwargs = dict(torch_dtype=torch.float16, trust_remote_code=True)
    return (
        AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs),
        AutoTokenizer.from_pretrained(model_name),
    )

model, tokenizer = load_model("deepseek-ai/deepseek-moe-16b-base")

In [4]:
def get_top_k_tokens(hidden_states: torch.Tensor, lm_head: torch.nn.Linear, tokenizer, k: int = 5) -> List[List[Tuple[str, float]]]:
    """Project hidden states through the LM head and return the top-k tokens per position.

    Args:
        hidden_states: Tensor shaped (hidden_dim,), (num_tokens, hidden_dim) or
            (batch_size, num_tokens, hidden_dim). Missing leading dimensions are added.
        lm_head: Output projection. ``lm_head.to(DEVICE)`` is called when its
            parameters are not on the global ``DEVICE``.
        tokenizer: Object exposing ``decode(token_id)``.
        k: Number of candidates per position (capped at the vocabulary size).

    Returns:
        One list per token position of the FIRST batch element; each inner list holds
        ``(decoded_token, logit)`` pairs ordered from highest to lowest logit.
        An empty list is returned when there are no token positions.
    """
    with torch.no_grad():
        head_param = next(lm_head.parameters())
        if head_param.device != DEVICE:
            lm_head = lm_head.to(DEVICE)

        # Match the head's device and dtype so the projection cannot fail on a mismatch
        hidden_states = hidden_states.to(device=DEVICE, dtype=head_param.dtype)

        # Promote to (batch_size, num_tokens, hidden_dim)
        while hidden_states.dim() < 3:
            hidden_states = hidden_states.unsqueeze(0)

        if hidden_states.size(1) == 0:
            return []

        # Take the top-k inside every chunk and keep only those k values per position.
        # Concatenating the full (num_tokens, vocab_size) logits first would defeat
        # the purpose of chunking.
        chunk_size = 128 if DEVICE.type == "cuda" else 32  # Smaller chunks for CPU
        score_chunks, id_chunks = [], []
        for chunk in torch.split(hidden_states, chunk_size, dim=1):
            chunk_logits = lm_head(chunk)
            top_k = min(k, chunk_logits.size(-1))
            chunk_scores, chunk_ids = torch.topk(chunk_logits, k=top_k, dim=-1)
            score_chunks.append(chunk_scores.cpu())
            id_chunks.append(chunk_ids.cpu())

            # Release the large logits tensor before the next chunk is projected
            del chunk_logits, chunk_scores, chunk_ids
            if DEVICE.type == "cuda":
                torch.cuda.empty_cache()

        # Only the first batch element is decoded, as plain Python numbers
        scores = torch.cat(score_chunks, dim=1)[0].tolist()
        token_ids = torch.cat(id_chunks, dim=1)[0].tolist()

    return [
        [(tokenizer.decode(token_id), score) for token_id, score in zip(pos_ids, pos_scores)]
        for pos_ids, pos_scores in zip(token_ids, scores)
    ]

In [5]:
class DeepseekLayerAnalyzer:
    """Captures per-layer, gate, expert and combined-MoE outputs of a DeepSeek MoE model via forward hooks.

    Args:
        model: The DeepSeek MoE model to analyze (moved to ``device`` on construction)
        tokenizer: The tokenizer associated with the model
        device: Device to run analysis on; a string such as 'cuda'/'cpu' or a ``torch.device``

    Attributes:
        layer_outputs (defaultdict): layer index -> list of hidden states, one per forward call
        moe_gate_outputs (defaultdict): layer index -> list of dicts with 'topk_idx', 'topk_weight', 'aux_loss'
        moe_combined_outputs (defaultdict): layer index -> list of dicts with 'combined_output', 'input'
        expert_outputs (defaultdict): layer index -> expert index -> list of per-token records
        shared_expert_outputs (defaultdict): layer index -> list of dicts with 'input', 'output'
        hooks (list): Handles of the currently registered PyTorch hooks
    """

    def __init__(self, model, tokenizer, device='cpu'):
        self.device = device
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.hooks = []
        self.reset_state()

        # Record total GPU memory when running on CUDA
        if self._on_cuda():
            self.gpu_mem = torch.cuda.get_device_properties(torch.device(self.device)).total_memory

    def _on_cuda(self) -> bool:
        """Return True when the configured device is a usable CUDA device.

        ``torch.device`` normalisation is required because comparing a
        ``torch.device`` object with the string 'cuda' is always False.
        """
        return torch.device(self.device).type == 'cuda' and torch.cuda.is_available()

    def _remove_hooks(self):
        """Detach every registered hook from the model without touching captured state."""
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()

    def reset_state(self):
        """Clear all stored state between runs and free GPU memory"""
        self.layer_outputs = defaultdict(list)
        self.moe_gate_outputs = defaultdict(list)
        self.moe_combined_outputs = defaultdict(list)
        self.expert_outputs = defaultdict(lambda: defaultdict(list))
        self.shared_expert_outputs = defaultdict(list)

        # Clear CUDA cache if using GPU
        if self._on_cuda():
            torch.cuda.empty_cache()
            gc.collect()

    def register_hooks(self):
        """Register hooks for layer outputs and MoE combination points.

        Calling this again first removes the hooks of the previous call, so
        outputs are never captured twice.
        """
        self._remove_hooks()

        def first_tensor(outputs):
            """Modules may return a tensor or a tuple whose first item is the tensor."""
            return outputs[0] if isinstance(outputs, tuple) else outputs

        def layer_output_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing layer outputs"""
                # Stored on the device the model ran on
                self.layer_outputs[layer_idx].append(first_tensor(outputs).detach())
            return hook

        def moe_gate_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing MoE gate outputs before expert computation"""
                # Only a (topk_idx, topk_weight, aux_loss) tuple is recorded
                if isinstance(outputs, tuple):
                    topk_idx, topk_weight, aux_loss = outputs
                    self.moe_gate_outputs[layer_idx].append({
                        'topk_idx': topk_idx.detach(),
                        'topk_weight': topk_weight.detach(),
                        'aux_loss': aux_loss.detach() if aux_loss is not None else None
                    })
            return hook

        def expert_hook(layer_idx, expert_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing expert outputs"""
                # The gate of this layer must have run before its experts
                if not self.moe_gate_outputs[layer_idx]:
                    print(f'no gate outputs for layer {layer_idx}')
                    return

                gate_data = self.moe_gate_outputs[layer_idx][-1]

                # Flatten any leading batch dimension so that every row is one token
                # and 'position' is a flat token index for both 2-D and 3-D gate outputs.
                topk_idx = gate_data['topk_idx']
                topk_idx = topk_idx.reshape(-1, topk_idx.shape[-1])
                topk_weight = gate_data['topk_weight'].reshape(topk_idx.shape)

                # Rows (tokens) and top-k slots where this expert was selected
                token_rows, slot_cols = torch.nonzero(topk_idx == expert_idx, as_tuple=True)
                if token_rows.numel() == 0:
                    return

                # The expert must have processed exactly the tokens routed to it
                expert_inputs = inputs[0]
                total_selected = expert_inputs.shape[0]
                expected_selected = token_rows.numel()
                if total_selected != expected_selected:
                    print(f" expert {expert_idx} processed {total_selected} tokens but expected {expected_selected}")
                    return

                # Index the tensor itself, never the tuple that may wrap it
                expert_out = first_tensor(outputs)

                # NOTE(review): row i of the expert input is assumed to belong to the
                # i-th selected token in ascending token order -- confirm against the
                # model's dispatch code.
                positions = token_rows.tolist()
                gate_weights = topk_weight[token_rows, slot_cols].tolist()
                for row_idx, (position, gate_weight) in enumerate(zip(positions, gate_weights)):
                    row_output = expert_out[row_idx].detach()
                    self.expert_outputs[layer_idx][expert_idx].append({
                        'position': position,
                        'input': expert_inputs[row_idx].detach(),
                        'output': row_output,
                        'hidden_state': row_output,
                        'gate_weight': gate_weight
                    })
            return hook

        def shared_expert_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing shared expert outputs"""
                self.shared_expert_outputs[layer_idx].append({
                    'input': inputs[0].detach(),
                    'output': first_tensor(outputs).detach()
                })
            return hook

        def moe_combine_hook(layer_idx):
            def hook(module, inputs, outputs):
                """Hook for capturing the output of the whole MoE block (layer.mlp)"""
                self.moe_combined_outputs[layer_idx].append({
                    'combined_output': first_tensor(outputs).detach(),
                    'input': inputs[0].detach()  # Original input before MoE
                })
            return hook

        # Register hooks for each layer
        for layer_idx, layer in enumerate(self.model.model.layers):
            self.hooks.append(layer.register_forward_hook(layer_output_hook(layer_idx)))

            # Layers whose mlp exposes 'experts' are treated as MoE layers
            if not hasattr(layer.mlp, 'experts'):
                continue

            self.hooks.append(layer.mlp.gate.register_forward_hook(moe_gate_hook(layer_idx)))

            for expert_idx, expert in enumerate(layer.mlp.experts):
                self.hooks.append(expert.register_forward_hook(expert_hook(layer_idx, expert_idx)))

            # Hook for shared expert if it exists
            if hasattr(layer.mlp, 'shared_experts'):
                self.hooks.append(
                    layer.mlp.shared_experts.register_forward_hook(shared_expert_hook(layer_idx))
                )

            # Hook for final combined output
            self.hooks.append(layer.mlp.register_forward_hook(moe_combine_hook(layer_idx)))

    def analyze_tokens(self, input_ids: torch.Tensor, return_hidden_states: bool = False) -> Dict:
        """Run one forward pass and decode the captured activations.

        Args:
            input_ids: Token ids; moved to ``self.device`` before the forward pass.
            return_hidden_states: When True, the last captured hidden state of
                every layer is returned under ``'hidden_states'``.

        Returns:
            Dict with keys:
              * ``'layer_predictions'``: layer index -> top-k tokens per position
              * ``'moe_analysis'``: layer index -> dict with 'selected_experts',
                'expert_weights', 'aux_loss', 'expert_predictions_by_position',
                'expert_hidden_states_by_position', 'shared_expert_predictions'
                and 'combined_output_tokens'
              * ``'hidden_states'``: ``'layer_<idx>'`` -> tensor, or None
        """
        # Clear all state before processing new input
        self.reset_state()

        input_ids = input_ids.to(self.device)

        # The hooks capture everything we need; the model output itself is unused
        with torch.no_grad():
            self.model(input_ids)

        results = {
            'layer_predictions': {},
            'moe_analysis': {},
            'hidden_states': {} if return_hidden_states else None
        }

        # Analyze layer outputs
        for layer_idx, captured in self.layer_outputs.items():
            if not captured:  # Skip if no outputs captured
                continue
            hidden_states = captured[-1]  # Keep on device

            results['layer_predictions'][layer_idx] = get_top_k_tokens(
                hidden_states, self.model.lm_head, self.tokenizer
            )

            if return_hidden_states:
                results['hidden_states'][f'layer_{layer_idx}'] = hidden_states

        # Analyze MoE layers
        for layer_idx in list(self.moe_gate_outputs.keys()):
            gate_calls = self.moe_gate_outputs[layer_idx]
            if not gate_calls:
                continue
            gate_data = gate_calls[-1]

            expert_predictions_by_pos = defaultdict(dict)
            expert_hidden_states_by_pos = defaultdict(dict)

            # Process expert outputs by position
            for expert_idx, data_list in self.expert_outputs[layer_idx].items():
                for data in data_list:
                    position = data['position']
                    # A single token: (1, hidden_dim)
                    predictions = get_top_k_tokens(
                        data['output'].unsqueeze(0),
                        self.model.lm_head,
                        self.tokenizer
                    )
                    expert_predictions_by_pos[position][expert_idx] = predictions[0]
                    expert_hidden_states_by_pos[position][expert_idx] = {
                        'hidden_state': data['hidden_state'].tolist(),
                        'gate_weight': data.get('gate_weight', None)
                    }

            # Predictions of the shared expert, when one was captured for this layer
            shared_expert_predictions = None
            shared_calls = self.shared_expert_outputs.get(layer_idx)
            if shared_calls:
                shared_expert_predictions = get_top_k_tokens(
                    shared_calls[-1]['output'],
                    self.model.lm_head,
                    self.tokenizer
                )

            # Predictions of the combined MoE output (None if it was not captured)
            combined_tokens = None
            combined_calls = self.moe_combined_outputs.get(layer_idx)
            if combined_calls:
                combined_tokens = get_top_k_tokens(
                    combined_calls[-1]['combined_output'],
                    self.model.lm_head,
                    self.tokenizer
                )

            results['moe_analysis'][layer_idx] = {
                'selected_experts': gate_data['topk_idx'].tolist(),
                'expert_weights': gate_data['topk_weight'].tolist(),
                'aux_loss': gate_data['aux_loss'].item() if gate_data['aux_loss'] is not None else None,
                'expert_predictions_by_position': dict(expert_predictions_by_pos),
                'expert_hidden_states_by_position': dict(expert_hidden_states_by_pos),
                'shared_expert_predictions': shared_expert_predictions,
                'combined_output_tokens': combined_tokens
            }

        return results

    def cleanup(self):
        """Remove all registered hooks and clear state"""
        self._remove_hooks()
        self.reset_state()

def analyze_deepseek_moe(model, tokenizer, input_text: str, return_hidden_states: bool = False, device='cpu'):
    """Analyze DeepSeek MoE model behavior for given input text.

    A fresh ``DeepseekLayerAnalyzer`` is created for the call and always cleaned
    up afterwards, so no hooks stay attached to ``model``.

    Args:
        model: Model to analyze.
        tokenizer: Tokenizer used to encode ``input_text``.
        input_text: Prompt to run through the model.
        return_hidden_states: Forwarded to ``DeepseekLayerAnalyzer.analyze_tokens``.
        device: Device handed to the analyzer.

    Returns:
        The result dict of ``DeepseekLayerAnalyzer.analyze_tokens``.
    """
    analyzer = DeepseekLayerAnalyzer(model, tokenizer, device=device)

    # Hook registration lives inside the try block: if it (or tokenization) fails
    # part-way, the hooks that were already attached are still removed.
    try:
        analyzer.register_hooks()
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids
        return analyzer.analyze_tokens(input_ids, return_hidden_states=return_hidden_states)
    finally:
        analyzer.cleanup()  # Ensure hooks are removed and state is cleared


In [6]:
# Get prompt and tokenize
prompt = "quick brown fox"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
n = input_ids.shape[1]
token_pos = n-1  # Last token

# Run on the notebook-wide DEVICE so that every tensor produced here lives on the
# same device as the ones computed by the later cells.
analysis = analyze_deepseek_moe(model, tokenizer, prompt, return_hidden_states=True, device=DEVICE.type)

# Create analyzer and get hidden states
analyzer = DeepseekLayerAnalyzer(model, tokenizer, device=DEVICE.type)
analyzer.register_hooks()
try:
    results = analyzer.analyze_tokens(input_ids, return_hidden_states=True)
finally:
    # Detach the hooks but keep the captured tensors (analyzer.layer_outputs is read
    # further down). Hooks left on the model would keep appending activations to
    # this analyzer on every later forward pass.
    for hook in analyzer.hooks:
        hook.remove()
    analyzer.hooks.clear()

In [6]:
def get_post_attn_ln_inputs(model, tokenizer, text, device=DEVICE):
    """Run ``text`` through ``model`` and capture the inputs of every post-attention layernorm.

    Args:
        model: Model exposing ``model.model.layers[i].post_attention_layernorm``.
        tokenizer: Tokenizer used to encode ``text``.
        text: Prompt to run.
        device: Target device, given as a string or a ``torch.device``.

    Returns:
        Dict mapping layer index -> list with one entry per forward call; each
        entry is the list of (detached) positional inputs of that layernorm.
        On CUDA the captured tensors are moved to the CPU.
    """
    # Normalise first: a torch.device never compares equal to the string 'cuda'
    device = torch.device(device)
    on_cuda = device.type == 'cuda'

    # Store inputs in a dict mapping layer idx -> inputs
    post_attn_ln_inputs = {}
    hooks = []

    def make_hook(layer_idx):
        def hook(module, inputs, output):
            # Detach; on CUDA also move to CPU to save GPU memory
            captured = [x.detach().cpu() if on_cuda else x.detach() for x in inputs]
            post_attn_ln_inputs.setdefault(layer_idx, []).append(captured)
        return hook

    try:
        # Registered inside the try block so a partial failure still removes the hooks
        for layer_idx, layer in enumerate(model.model.layers):
            hooks.append(layer.post_attention_layernorm.register_forward_hook(make_hook(layer_idx)))

        # Move input to device
        input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

        # Ensure model is on correct device
        model = model.to(device)

        # Inference only: no autograd graph is needed for the captured activations
        with torch.no_grad():
            if on_cuda:
                with torch.autocast(device_type="cuda"):  # Use mixed precision for CUDA
                    model(input_ids)
                torch.cuda.empty_cache()  # Clear CUDA cache
            else:
                model(input_ids)

        return post_attn_ln_inputs

    finally:
        # Clean up hooks
        for hook in hooks:
            hook.remove()

In [8]:
# Inputs of every post-attention layernorm for the prompt (used later as the residual stream)
post_attn_ln_inputs = get_post_attn_ln_inputs(model=model, tokenizer=tokenizer, text=prompt)

In [7]:
def get_expert_hidden_states_by_weight(analysis, post_attn_ln_inputs, token_pos, k=0):
    """Gets hidden states (with residual added) from the k-th highest weighted expert for all MoE layers.

    Args:
        analysis: Result dict containing ``'moe_analysis'`` (layer index -> dict with
            'selected_experts', 'expert_weights' and 'expert_hidden_states_by_position').
        post_attn_ln_inputs: Layer index -> list of captured layernorm inputs; the
            residual is read as ``post_attn_ln_inputs[layer][-1][0][0][token_pos]``.
        token_pos: Token position to inspect.
        k: Rank of the expert by gate weight (0 = highest weight).

    Returns:
        Dict mapping layer index -> float16 tensor ``residual + expert_hidden_state``
        on CUDA when available, otherwise on the CPU. Layers for which the expert
        rank, the expert hidden state or the residual is unavailable are skipped
        with a printed warning.
    """
    hidden_states_by_layer = {}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # For each layer that has MoE
    for layer_idx, moe_analysis in analysis['moe_analysis'].items():
        # Map the experts selected for this token to their gate weights
        expert_weights = dict(zip(
            moe_analysis['selected_experts'][token_pos],
            moe_analysis['expert_weights'][token_pos],
        ))

        # Sort by weight and get k-th highest
        sorted_experts = sorted(expert_weights.items(), key=lambda x: x[1], reverse=True)
        if k >= len(sorted_experts):
            print(f"Warning: Layer {layer_idx} only has {len(sorted_experts)} experts, but k={k} requested")
            continue

        target_expert = sorted_experts[k][0]  # Get the k-th expert's id

        # The expert's output may be missing when its forward hook skipped the call
        try:
            expert_hidden_state = (
                moe_analysis['expert_hidden_states_by_position'][token_pos][target_expert]['hidden_state']
            )
        except KeyError as e:
            print(f"Warning: No hidden state for expert {target_expert} in layer {layer_idx}, skipping. Error: {e}")
            continue

        # Get residual stream for this token
        try:
            residual = post_attn_ln_inputs[layer_idx][-1][0][0][token_pos]
        except (KeyError, IndexError) as e:
            print(f"Warning: Could not get residual for layer {layer_idx}, skipping. Error: {e}")
            continue

        # Bring both operands to the same device and dtype (accepts lists and tensors)
        expert_hidden_state = torch.as_tensor(expert_hidden_state).to(device=device, dtype=torch.float16)
        residual = residual.to(device=device, dtype=torch.float16)

        # Both operands are already float16, so the sum needs no autocast context
        hidden_states_by_layer[layer_idx] = residual + expert_hidden_state

    return hidden_states_by_layer

In [None]:
# Get final layer index and hidden state
final_layer_idx = max([int(k) for k in results["layer_predictions"].keys()])
# Last captured output of the final layer, first batch element, last prompt token
final_hidden_state = analyzer.layer_outputs[final_layer_idx][-1][0][token_pos]

# Print the shape (not the tensor contents), as the label says
print("Shape of final hidden state :", final_hidden_state.shape)

In [None]:

# Rank of the expert to inspect (0 = highest gate weight)
k = 0

# Position of the last prompt token
n = len(tokenizer.encode(prompt))
token_pos = n - 1

# Residual + expert output of the rank-k expert, for every MoE layer
hidden_states = get_expert_hidden_states_by_weight(
    analysis,
    post_attn_ln_inputs=post_attn_ln_inputs,
    token_pos=token_pos,
    k=k,
)

print(f'hidden_states : {hidden_states}')

In [None]:
# cosine similarity for each layer
sim = {}

# Compare in float32: the norms computed for a float16 cosine similarity can
# overflow for large hidden states.
reference = final_hidden_state.float().unsqueeze(0)
for layer_idx, layer_hidden in hidden_states.items():
    # Align the device with the reference before comparing
    candidate = layer_hidden.to(reference.device).float().unsqueeze(0)
    similarity = F.cosine_similarity(reference, candidate)
    sim[layer_idx] = similarity.item()

# Print results sorted by layer
for layer_idx in sorted(sim.keys()):
    print(f"Layer {layer_idx} similarity: {sim[layer_idx]:.4f}")

In [None]:
def plot_cosine_similarities(similarities, k):
    """
    Create an interactive line plot of cosine similarities across layers.

    Args:
    similarities: dict with layer_idx -> cosine_similarity_value
    k: which expert (by weight rank, 0 = highest) was used

    Returns:
    The plotly figure (not shown; call ``fig.show()`` on it).
    """
    # Set plot size
    plot_size = 500  # Size in pixels for both width and height

    # Sort layers for x-axis
    layers = sorted(similarities.keys())
    sim_values = [similarities[layer] for layer in layers]

    # English ordinal for the 1-based rank shown in the subtitle (1st, 2nd, 3rd, 4th, 11th, ...)
    rank = k + 1
    if 10 <= rank % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(rank % 10, 'th')

    # Create figure
    fig = go.Figure()

    # Add line plot
    fig.add_trace(go.Scatter(
        x=layers,
        y=sim_values,
        mode='lines+markers',
        name=f'Expert rank {k}',
        hovertemplate=
        "<b>Layer %{x}</b><br>" +
        "Cosine Similarity: %{y:.4f}<br>" +
        "<extra></extra>",  # Removes secondary box
        line=dict(width=2),
        marker=dict(size=8)
    ))

    # Update layout
    fig.update_layout(
        title=dict(
            text=f'Cosine Similarity with Final Hidden State<br><sup>Using {rank}{suffix} highest weighted expert per layer</sup>',
            x=0.5,  # Center title
            y=0.95
        ),
        xaxis=dict(
            title='Layer',
            gridcolor='rgba(128,128,128,0.2)',
            tickmode='linear',
            dtick=1,  # Show every layer number
            range=[0, 27.5]  # Fixed x-axis window covering layer indices 0 to 27
        ),
        yaxis=dict(
            title='Cosine Similarity',
            gridcolor='rgba(128,128,128,0.2)',
            range=[-0.2, 1]  # Fixed y-axis window
        ),
        plot_bgcolor='white',
        hovermode='x unified',  # Shows all points at a given x-coordinate
        showlegend=False,
        width=plot_size,
        height=plot_size
    )

    # Add grid
    fig.update_xaxes(showgrid=True, gridwidth=1)
    fig.update_yaxes(showgrid=True, gridwidth=1)

    return fig

# Use the function
fig = plot_cosine_similarities(sim, k)
fig.show()

In [8]:


def analyze_multiple_samples(model, tokenizer, domain: str, samples: List[str], start_idx: int = 0):
    """Analyze multiple samples and return average cosine similarities per layer.

    For every sample, the hidden state of the last token at the final layer is
    compared with (residual + output of the highest-weighted expert) at each MoE
    layer. Per-sample results are written to
    ``cosine-sim/csv/{domain}_{n}_similarities.csv`` (``n`` is 1-based).

    Args:
        model: Model to analyze.
        tokenizer: Tokenizer for the samples.
        domain: Name used in the CSV file names.
        samples: Texts to analyze.
        start_idx: 1-based index of the first sample to process (0 or 1 = first).

    Returns:
        ``(avg_similarities, all_similarities)``. ``avg_similarities`` maps
        layer -> mean similarity over ALL per-sample CSV files of this domain
        found on disk (so results of earlier, resumed runs are included);
        ``all_similarities`` lists the per-layer dicts of this call only.
        Samples that hit a CUDA out-of-memory error are skipped.
    """
    import contextlib
    import re

    all_similarities = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == "cuda"

    # Convert from 1-based to 0-based indexing
    zero_based_start = start_idx - 1 if start_idx > 0 else 0

    # Create output directories if they don't exist
    csv_dir = 'cosine-sim/csv'
    os.makedirs(csv_dir, exist_ok=True)

    def free_cuda_memory():
        """Release cached GPU memory; a no-op on CPU."""
        if on_cuda:
            torch.cuda.empty_cache()
            gc.collect()

    progress = tqdm(samples[zero_based_start:], desc="Analyzing samples", initial=zero_based_start, total=len(samples))
    for i, sample in enumerate(progress):
        sample_number = i + zero_based_start + 1  # 1-based, used in file names and messages
        try:
            # Clear CUDA cache before processing each sample
            free_cuda_memory()

            # Only the token count is needed here, so the ids stay on the CPU
            n = tokenizer(sample, return_tensors="pt").input_ids.shape[1]
            token_pos = n-1  # Last token

            # Mixed precision on CUDA, plain execution on CPU
            amp_context = torch.autocast(device_type="cuda") if on_cuda else contextlib.nullcontext()
            with amp_context:
                analysis = analyze_deepseek_moe(model, tokenizer, sample, return_hidden_states=True, device=device.type)
                post_attn_ln_inputs = get_post_attn_ln_inputs(model, tokenizer, text=sample, device=device.type)

            # Get final layer index and hidden state
            final_layer_idx = max([int(k) for k in analysis["layer_predictions"].keys()])
            final_hidden_state = analysis["hidden_states"][f"layer_{final_layer_idx}"][0][token_pos]

            # Get hidden states for highest weighted expert (k=0)
            hidden_states = get_expert_hidden_states_by_weight(
                analysis,
                post_attn_ln_inputs=post_attn_ln_inputs,
                token_pos=token_pos,
                k=0
            )

            # Cosine similarities in float32: float16 norms can overflow
            reference = final_hidden_state.float().unsqueeze(0)
            similarities = {}
            for layer_idx, layer_hidden in hidden_states.items():
                candidate = layer_hidden.to(reference.device).float().unsqueeze(0)
                similarities[layer_idx] = F.cosine_similarity(reference, candidate).item()

            all_similarities.append(similarities)

            # Save individual sample results to CSV using 1-based indexing
            df = pd.DataFrame({
                'layer': list(similarities.keys()),
                'cosine_similarity': list(similarities.values())
            })
            df.to_csv(f'{csv_dir}/{domain}_{sample_number}_similarities.csv', index=False)

            # Clear variables to free memory
            del analysis, post_attn_ln_inputs, hidden_states, final_hidden_state, reference
            free_cuda_memory()

        except RuntimeError as e:
            # Only out-of-memory errors are tolerated; anything else propagates unchanged
            if "out of memory" not in str(e):
                raise
            print(f"\nOOM error on sample {sample_number}. Clearing cache and skipping sample...")
            free_cuda_memory()
            continue

    # Calculate average similarities from individual CSV files
    avg_similarities = {}

    # The glob alone would also match other domains sharing this prefix
    # (e.g. "test" vs "test_code"), so file names are checked exactly.
    name_pattern = re.compile(rf'{re.escape(domain)}_\d+_similarities\.csv')
    csv_files = sorted(
        path
        for path in glob.glob(f'{csv_dir}/{glob.escape(domain)}_*_similarities.csv')
        if name_pattern.fullmatch(os.path.basename(path))
    )

    # Read and combine all CSV files
    all_dfs = [pd.read_csv(path) for path in csv_files]

    if all_dfs:
        # Combine all dataframes and calculate mean per layer
        combined_df = pd.concat(all_dfs)
        avg_df = combined_df.groupby('layer')['cosine_similarity'].mean().reset_index()
        avg_df.columns = ['layer', f'average_cosine_similarity_{domain}']

        # Convert to dictionary format
        avg_similarities = dict(zip(avg_df['layer'], avg_df[f'average_cosine_similarity_{domain}']))

        # Save average similarities to CSV
        avg_df.to_csv(f'{csv_dir}/average_similarities_{domain}.csv', index=False)

    return avg_similarities, all_similarities

def _split_code_blocks(content: str) -> List[str]:
    """Split a concatenated source dump into one sample per file header line."""
    import re

    # A header is any line that ends with a file name carrying a known source extension
    file_pattern = re.compile(r'.*\b\w+\.(js|py|c|cpp|java|ts|rb|go|rs|cs|swift|kt|php)$', re.MULTILINE)
    starts = [match.start() for match in file_pattern.finditer(content)]

    # Every block runs up to the next header; the last one runs to the end of the file
    ends = starts[1:] + [len(content)]
    return [content[start:end].strip() for start, end in zip(starts, ends)]


def _read_samples(file_path: str) -> List[str]:
    """Read ``file_path`` and return its samples (code blocks or non-empty lines)."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError as exc:
        # Same exception type as before, now chained to the original error
        raise ValueError(f"Could not find file: {file_path}") from exc

    # Check if this is a GitHub code file
    if 'github.txt' in file_path:
        return _split_code_blocks(content)

    # Regular text file processing (one prompt per line)
    return [line.strip() for line in content.split('\n') if line.strip()]


def process_samples(model, tokenizer, file_path: str="test.txt", start_from: int = 1):
    """
    Process samples from a text file starting from a 1-based index position.

    Files whose path contains 'github.txt' are split into code blocks at file
    header lines; any other file contributes one sample per non-empty line.
    The domain name used for the output files is the file name without extension.

    Args:
        model: The model to analyze
        tokenizer: The tokenizer to use
        file_path: Path to the text file containing samples
        start_from: 1-based index of the sample to start from (default: 1 for first sample)

    Returns:
        ``(avg_sims, all_sims)`` as returned by ``analyze_multiple_samples``.

    Raises:
        ValueError: If the file does not exist or ``start_from`` is outside
            ``1..len(samples)``.
    """
    # Prefer CUDA when it is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    if device.type == "cuda":
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
        print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB")

    samples = _read_samples(file_path)
    print(f"Loaded {len(samples)} samples from {file_path}")

    # Validate start_from parameter (using 1-based indexing)
    if start_from < 1:
        raise ValueError("start_from must be at least 1 (1-based indexing)")
    if start_from > len(samples):
        raise ValueError(f"start_from ({start_from}) must be less than or equal to total samples ({len(samples)})")

    # Extract domain name from file path
    domain = os.path.splitext(os.path.basename(file_path))[0]

    # Make sure model is on the correct device
    model = model.to(device)

    # Run analysis
    avg_sims, all_sims = analyze_multiple_samples(model, tokenizer, domain, samples, start_from)

    # Print average similarities
    print("\nAverage Cosine Similarities per Layer:")
    for layer_idx in sorted(avg_sims.keys()):
        print(f"Layer {layer_idx}: {avg_sims[layer_idx]:.4f}")

    return avg_sims, all_sims

def plot_multi_sample_similarities(all_similarities, avg_similarities, domain):
    """
    Create an interactive plot showing average cosine similarities across layers.

    The curve is read from ``cosine-sim/csv/average_similarities_{domain}.csv``
    when that file exists. When it does not, the in-memory ``avg_similarities``
    mapping is plotted instead, so a missing CSV no longer yields a silently
    empty figure.

    Args:
        all_similarities: Per-sample similarities. Accepted for interface
            compatibility; not used by this function.
        avg_similarities: Mapping of layer -> average cosine similarity. Only
            used as a fallback when the CSV is not on disk.
        domain: Domain name; selects the CSV file and its value column and is
            shown (capitalized) in the figure title.

    Returns:
        The plotly figure (empty if neither the CSV nor usable in-memory
        averages are available).
    """
    fig = go.Figure()

    avg_file = f'cosine-sim/csv/average_similarities_{domain}.csv'
    layer_values = None
    similarity_values = None

    if os.path.exists(avg_file):
        # Preferred source: the averages that were written to disk
        avg_df = pd.read_csv(avg_file)
        layer_values = avg_df['layer']
        similarity_values = avg_df[f'average_cosine_similarity_{domain}']
    elif isinstance(avg_similarities, dict) and avg_similarities:
        # Fallback: plot the averages passed in by the caller, in layer order
        layer_values = sorted(avg_similarities.keys())
        similarity_values = [avg_similarities[layer] for layer in layer_values]
    else:
        print(f"Warning: {avg_file} not found and no in-memory averages were provided; "
              f"returning an empty figure")

    if layer_values is not None:
        fig.add_trace(go.Scatter(
            x=layer_values,
            y=similarity_values,
            mode='lines+markers',
            name='Average',
            line=dict(width=3, color='black'),
            marker=dict(size=8),
            hovertemplate="<b>Layer %{x}</b><br>Average Similarity: %{y:.4f}<extra></extra>"
        ))

    fig.update_layout(
        title=f'Average Cosine Similarities - {domain.capitalize()}',
        xaxis_title='Layer',
        yaxis_title='Cosine Similarity',
        yaxis_range=[-0.2, 1],
        showlegend=True,
        hovermode='closest'
    )

    return fig

In [None]:
# Wrap the already-loaded model and tokenizer in an analyzer and register its hooks
analyzer = DeepseekLayerAnalyzer(model, tokenizer)
analyzer.register_hooks()

# Label reused by the plotting cell below when naming its output files
domain = "test"

# Run the multi-sample analysis starting at the first sample (start_from is 1-based).
# NOTE(review): the visible tail of process_samples derives its own `domain` from
# `file_path`; confirm that the function actually accepts a `domain` keyword.
avg_sims, all_sims = process_samples(
    model,
    tokenizer,
    domain=domain,
    start_from=1,
)

In [None]:
# plot_multi_sample_similarities takes `domain` as a required third argument;
# omitting it raises a TypeError.
fig = plot_multi_sample_similarities(all_sims, avg_sims, domain)

# Make sure the export directory exists before writing the figure files
os.makedirs("cosine-sim", exist_ok=True)
fig.write_image(f"cosine-sim/cosine-sim-plot-{domain}.png")
fig.write_html(f"cosine-sim/cosine-sim-plot-{domain}.html")
fig.show()

---

### Comparing experts at different ranks

In [9]:
def get_expert_hidden_state_by_rank(analysis: Dict, post_attn_ln_inputs: Dict, 
                                 token_pos: int, rank: int = 1, 
                                 include_residual: bool = True) -> Dict[int, torch.Tensor]:
    """
    Gets hidden state from the expert at specified rank (1 = highest weight, 6 = 6th highest weight)

    For every layer in ``analysis['moe_analysis']`` the experts selected for
    ``token_pos`` are ordered by routing weight (descending) and the hidden
    state of the expert at position ``rank`` is returned as a float16 tensor
    on CUDA when available, otherwise on CPU.

    Args:
        analysis: Analysis results from DeepseekLayerAnalyzer. Must provide
            ``analysis['moe_analysis'][layer]`` with the entries
            ``'selected_experts'``, ``'expert_weights'`` and
            ``'expert_hidden_states_by_position'``, each indexable by token position.
        post_attn_ln_inputs: Inputs to post attention layer norm, keyed by layer.
            Only read when ``include_residual`` is True.
        token_pos: Position of token to analyze
        rank: Which expert to select (1 = highest weight, 2 = second highest, etc.)
        include_residual: Whether to include residual connection

    Returns:
        Mapping of layer index -> hidden state tensor. Layers where ``rank``
        exceeds the number of selected experts, or where the residual cannot be
        looked up, are skipped with a printed warning.

    Raises:
        ValueError: If ``rank`` is smaller than 1. Without this check a rank of
            0 or below would wrap around through negative indexing and silently
            return a low-weight expert instead.
    """
    if rank < 1:
        raise ValueError(f"rank must be at least 1 (1-based indexing), got {rank}")

    hidden_states_by_layer = {}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # For each layer that has MoE
    for layer_idx, moe_analysis in analysis['moe_analysis'].items():
        # Map the experts selected for this token to their routing weights
        selected_experts = moe_analysis['selected_experts'][token_pos]
        expert_weights_list = moe_analysis['expert_weights'][token_pos]
        expert_weights = dict(zip(selected_experts, expert_weights_list))

        # Highest weight first, so list position == rank - 1
        sorted_experts = sorted(expert_weights.items(), key=lambda item: item[1], reverse=True)
        if rank > len(sorted_experts):
            print(f"Warning: Requested rank {rank} but only {len(sorted_experts)} experts available")
            continue

        expert_idx, _weight = sorted_experts[rank - 1]  # rank-1 because rank is 1-based

        # Get hidden state for this expert
        expert_hidden_state = moe_analysis['expert_hidden_states_by_position'][token_pos][expert_idx]['hidden_state']

        # Convert to tensor if needed and move to correct device
        if isinstance(expert_hidden_state, list):
            expert_hidden_state = torch.tensor(expert_hidden_state, device=device, dtype=torch.float16)
        elif isinstance(expert_hidden_state, torch.Tensor):
            expert_hidden_state = expert_hidden_state.to(device=device, dtype=torch.float16)

        # Add residual if requested
        if include_residual:
            try:
                # Last recorded entry for the layer, then first element / first
                # batch row, then the token position.
                residual = post_attn_ln_inputs[layer_idx][-1][0][0][token_pos]
                residual = residual.to(device=device, dtype=torch.float16)
                expert_hidden_state = expert_hidden_state + residual
            except (KeyError, IndexError) as e:
                print(f"Warning: Could not get residual for layer {layer_idx}, skipping. Error: {e}")
                continue

        hidden_states_by_layer[layer_idx] = expert_hidden_state

    return hidden_states_by_layer

def compare_expert_ranks(analysis: Dict, post_attn_ln_inputs: Dict,
                        token_pos: int, final_hidden_state: torch.Tensor,
                        rank1: int = 1, rank2: int = 6,
                        include_residual: bool = True) -> Dict[int, Tuple[float, float, float]]:
    """
    Compare cosine similarities between two experts at different ranks and the final hidden state.

    The expert hidden states are fetched per layer with
    ``get_expert_hidden_state_by_rank``. The similarities themselves are
    computed in float32: the inputs arrive as float16, and half-precision
    norm/dot reductions can overflow or lose precision for large hidden
    vectors.

    Args:
        analysis: Analysis results passed through to ``get_expert_hidden_state_by_rank``.
        post_attn_ln_inputs: Post-attention layer-norm inputs, passed through as well.
        token_pos: Position of the token to analyze.
        final_hidden_state: Reference vector the expert outputs are compared against.
        rank1: Rank (1-based) of the first expert.
        rank2: Rank (1-based) of the second expert.
        include_residual: Whether the residual is added to the expert outputs.

    Returns:
        Mapping of layer index -> ``(sim(rank1, final), sim(rank2, final),
        sim(rank1, rank2))`` as Python floats. Only layers for which both ranks
        produced a hidden state are included.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Row vector in float32, shared by every per-layer comparison
    final_vec = final_hidden_state.to(device=device, dtype=torch.float32).unsqueeze(0)

    # Get hidden states for both experts
    hidden_states_rank1 = get_expert_hidden_state_by_rank(analysis, post_attn_ln_inputs, token_pos, rank1, include_residual=include_residual)
    hidden_states_rank2 = get_expert_hidden_state_by_rank(analysis, post_attn_ln_inputs, token_pos, rank2, include_residual=include_residual)

    similarities = {}

    for layer_idx, state_rank1 in hidden_states_rank1.items():
        # A layer is only comparable when both ranks yielded a hidden state
        if layer_idx not in hidden_states_rank2:
            continue

        h1 = state_rank1.to(device=device, dtype=torch.float32).unsqueeze(0)
        h2 = hidden_states_rank2[layer_idx].to(device=device, dtype=torch.float32).unsqueeze(0)

        similarities[layer_idx] = (
            F.cosine_similarity(h1, final_vec).item(),
            F.cosine_similarity(h2, final_vec).item(),
            F.cosine_similarity(h1, h2).item(),
        )

    return similarities

def plot_expert_rank_comparisons(similarities: Dict[int, Tuple[float, float, float]], 
                               rank1: int = 1, rank2: int = 6,
                               domain: str = "test"):
    """
    Build an interactive line chart with three similarity curves per layer.

    Args:
        similarities: Mapping of layer -> 3-tuple whose slots are, in order,
            rank1-to-final, rank2-to-final and rank1-to-rank2 similarity.
        rank1: Rank number shown in the legend for the first expert.
        rank2: Rank number shown in the legend for the second expert.
        domain: Name shown (capitalized) in the figure title.

    Returns:
        The plotly figure.
    """
    ordered_layers = sorted(similarities.keys())

    # One curve per tuple slot; the label doubles as legend entry and hover text
    curve_labels = (
        f'Rank {rank1} to Final',
        f'Rank {rank2} to Final',
        f'Rank {rank1} to Rank {rank2}',
    )

    fig = go.Figure()
    for slot, label in enumerate(curve_labels):
        fig.add_trace(go.Scatter(
            x=ordered_layers,
            y=[similarities[layer][slot] for layer in ordered_layers],
            mode='lines+markers',
            name=label,
            line=dict(width=2),
            marker=dict(size=8),
            hovertemplate="<b>Layer %{x}</b><br>" +
                         f"{label}: %{{y:.4f}}<extra></extra>"
        ))

    # Axis and title settings are assembled first, then applied in one call
    grid_color = 'rgba(128,128,128,0.2)'
    title_cfg = dict(
        text=f'Expert Rank Comparisons - {domain.capitalize()}',
        x=0.5,
        y=0.95
    )
    x_axis_cfg = dict(
        title='Layer',
        gridcolor=grid_color,
        tickmode='linear',
        dtick=1,
        range=[0, 27.5]
    )
    y_axis_cfg = dict(
        title='Cosine Similarity',
        gridcolor=grid_color,
        range=[-0.2, 1]
    )
    fig.update_layout(
        title=title_cfg,
        xaxis=x_axis_cfg,
        yaxis=y_axis_cfg,
        plot_bgcolor='white',
        hovermode='x unified',
        showlegend=True,
        width=500,
        height=500
    )

    # Draw grid lines on both axes
    fig.update_xaxes(showgrid=True, gridwidth=1)
    fig.update_yaxes(showgrid=True, gridwidth=1)

    return fig


def analyze_expert_ranks_and_plot(model, tokenizer, prompt: str, 
                                rank1: int = 1, rank2: int = 6,
                                domain: str = "test",
                                include_residual=True,
                                device="cuda"):
    """
    Run the expert-rank analysis for one prompt and export the resulting plot.

    The last token of the tokenized prompt is analysed. The figure is written
    to ``cosine-sim-avg/expert-rank-comparison-{domain}.png`` and ``.html``.

    Args:
        model: Model handed to the analysis helpers.
        tokenizer: Tokenizer used to tokenize ``prompt``.
        prompt: Text to analyse.
        rank1: Rank (1-based) of the first expert to compare.
        rank2: Rank (1-based) of the second expert to compare.
        domain: Label used in the plot title and the output file names.
        include_residual: Whether the residual is added to the expert outputs.
        device: CUDA is used only when this equals "cuda" and CUDA is
            available; otherwise the CPU is used.

    Returns:
        Tuple ``(similarities, fig)``.
    """
    # Fall back to CPU unless CUDA was requested and is actually available
    cuda_usable = torch.cuda.is_available() and device == "cuda"
    device = torch.device(device if cuda_usable else "cpu")

    # The analysed position is the final token of the prompt
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    token_pos = input_ids.shape[1] - 1

    if device.type != "cuda":
        analysis = analyze_deepseek_moe(model, tokenizer, prompt, return_hidden_states=True)
        post_attn_ln_inputs = get_post_attn_ln_inputs(model, tokenizer, text=prompt)
    else:
        # On CUDA both analysis passes run under autocast (mixed precision)
        with torch.cuda.amp.autocast():
            analysis = analyze_deepseek_moe(model, tokenizer, prompt, return_hidden_states=True, device=device.type)
            post_attn_ln_inputs = get_post_attn_ln_inputs(model, tokenizer, text=prompt, device=device.type)

    # Reference vector: hidden state of the highest-numbered layer at token_pos
    last_layer = max([int(key) for key in analysis["layer_predictions"].keys()])
    final_hidden_state = analysis["hidden_states"][f"layer_{last_layer}"][0][token_pos]

    similarities = compare_expert_ranks(
        analysis,
        post_attn_ln_inputs,
        token_pos,
        final_hidden_state,
        rank1=rank1,
        rank2=rank2,
        include_residual=include_residual,
    )

    # The export directory must exist before the figure files are written
    os.makedirs('cosine-sim-avg', exist_ok=True)

    fig = plot_expert_rank_comparisons(similarities, rank1=rank1, rank2=rank2, domain=domain)

    output_stem = f"cosine-sim-avg/expert-rank-comparison-{domain}"
    fig.write_image(f"{output_stem}.png")
    fig.write_html(f"{output_stem}.html")

    return similarities, fig

In [10]:
def _load_samples_from_file(file_path: str) -> List[str]:
    """Read prompts from `file_path`: one per non-empty line, or one per code file for 'github.txt' inputs."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError as err:
        raise ValueError(f"Could not find file: {file_path}") from err

    # Check if this is a GitHub code file
    if 'github.txt' in file_path:
        import re
        # A line ending in a known source-file extension marks the start of a code block
        file_pattern = re.compile(r'.*\b\w+\.(js|py|c|cpp|java|ts|rb|go|rs|cs|swift|kt|php)$', re.MULTILINE)
        header_starts = [match.start() for match in file_pattern.finditer(content)]
        # Each block runs up to the next header; the last one runs to the end of the file
        block_ends = header_starts[1:] + [len(content)]
        return [content[start:end].strip() for start, end in zip(header_starts, block_ends)]

    # Regular text file processing (one prompt per line)
    return [line.strip() for line in content.split('\n') if line.strip()]


def _rank_similarities_to_frame(similarities) -> pd.DataFrame:
    """Flatten {layer: (rank1_final, rank6_final, rank1_rank6)} into the CSV column layout."""
    return pd.DataFrame({
        'layer': list(similarities.keys()),
        'rank1_to_final': [s[0] for s in similarities.values()],
        'rank6_to_final': [s[1] for s in similarities.values()],
        'rank1_to_rank6': [s[2] for s in similarities.values()]
    })


def _average_rank_similarities(all_similarities):
    """Average the three similarity values per layer over all per-sample dicts."""
    sums = defaultdict(lambda: [0.0, 0.0, 0.0])  # [sum_rank1_final, sum_rank6_final, sum_rank1_rank6]
    counts = defaultdict(int)

    for sim_dict in all_similarities:
        for layer, (r1f, r6f, r1r6) in sim_dict.items():
            sums[layer][0] += r1f
            sums[layer][1] += r6f
            sums[layer][2] += r1r6
            counts[layer] += 1

    return {layer: tuple(total / counts[layer] for total in sums[layer]) for layer in sums}


async def process_samples_with_expert_ranks(model, tokenizer, file_path: str = "test.txt", start_from: int = 1):
    """
    Process multiple samples from a text file and analyze expert ranks

    Every sample is analysed with ``analyze_expert_ranks_and_plot`` (rank 1 vs
    rank 6, residual included). Per-sample results are written to
    ``cosine-sim-avg/csv/{domain}_expert_ranks_{n}.csv``; the per-layer averages
    go to ``cosine-sim-avg/csv/{domain}_expert_ranks_avg.csv`` and are plotted
    to ``cosine-sim-avg/expert-rank-comparison-{domain}_average.png`` / ``.html``.
    ``domain`` is the file name of ``file_path`` without its extension.

    The function contains no ``await``; it remains a coroutine function so that
    existing ``await process_samples_with_expert_ranks(...)`` calls keep working.

    Args:
        model: The model to analyze
        tokenizer: The tokenizer to use
        file_path: Path to the text file containing samples
        start_from: 1-based index of where to start processing samples
            (values below 1 are treated as 1)

    Returns:
        Tuple ``(avg_similarities, all_similarities, fig)``: the per-layer
        averages, the list of per-sample similarity dicts, and the figure of
        the averages.

    Raises:
        ValueError: If ``file_path`` does not exist.
        RuntimeError: Any runtime error raised while analysing a sample, except
            out-of-memory errors, which cause the sample to be skipped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == "cuda"
    print(f"Using device: {device}")

    if on_cuda:
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
        print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB")

        # 'high' allows float32 matmuls to use faster reduced-precision kernels;
        # it trades a little precision for speed.
        torch.set_float32_matmul_precision('high')

    samples = _load_samples_from_file(file_path)
    print(f"Loaded {len(samples)} samples from {file_path}")

    # Extract domain name from file path
    domain = os.path.splitext(os.path.basename(file_path))[0]

    # Make sure model is on the correct device
    model = model.to(device)

    # Convert from 1-based to 0-based indexing
    zero_based_start = start_from - 1 if start_from > 0 else 0

    # Store all similarities
    all_similarities = []

    # makedirs creates the parent 'cosine-sim-avg' directory as well
    os.makedirs('cosine-sim-avg/csv', exist_ok=True)

    progress = tqdm(samples[zero_based_start:],
                    desc="Analyzing samples",
                    initial=zero_based_start,
                    total=len(samples))
    for i, sample in enumerate(progress):
        sample_number = i + zero_based_start + 1  # 1-based position within the file
        try:
            # Clear CUDA cache before processing each sample
            if on_cuda:
                torch.cuda.empty_cache()
                gc.collect()

            # Same analysis on both devices; on CUDA it runs under autocast
            analysis_kwargs = dict(
                prompt=sample,
                rank1=1,
                rank2=6,
                domain=f"{domain}_sample_{sample_number}",
                include_residual=True,
                device=device.type,
            )
            if on_cuda:
                with torch.cuda.amp.autocast():
                    similarities, _ = analyze_expert_ranks_and_plot(model, tokenizer, **analysis_kwargs)
            else:
                similarities, _ = analyze_expert_ranks_and_plot(model, tokenizer, **analysis_kwargs)

            # Store results
            all_similarities.append(similarities)

            # Save individual sample results to CSV
            sample_df = _rank_similarities_to_frame(similarities)
            sample_df.to_csv(f'cosine-sim-avg/csv/{domain}_expert_ranks_{sample_number}.csv',
                           index=False)

            # Print memory stats periodically for monitoring
            if on_cuda and i % 5 == 0:
                print(f"\nCUDA Memory After Sample {sample_number}: "
                      f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB, "
                      f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB")

        except RuntimeError as e:
            if "out of memory" not in str(e):
                # Bare raise keeps the original traceback intact
                raise
            print(f"\nOOM error on sample {sample_number}. Clearing cache and skipping...")
            if on_cuda:
                torch.cuda.empty_cache()
                gc.collect()
            continue

    # Calculate average similarities per layer over all processed samples
    avg_similarities = _average_rank_similarities(all_similarities)

    # Also save raw averages
    raw_avg_df = _rank_similarities_to_frame(avg_similarities)
    raw_avg_df.to_csv(f'cosine-sim-avg/csv/{domain}_expert_ranks_avg.csv', index=False)

    # Create and save average plot
    fig = plot_expert_rank_comparisons(avg_similarities, rank1=1, rank2=6, domain=f"{domain}_average")
    fig.write_image(f"cosine-sim-avg/expert-rank-comparison-{domain}_average.png")
    fig.write_html(f"cosine-sim-avg/expert-rank-comparison-{domain}_average.html")

    return avg_similarities, all_similarities, fig

In [None]:
# Single-prompt example: compare the top-weighted expert with the 6th-ranked one
prompt = (
    "One might expect language modeling performance to depend on model architecture, "
    "the size of neural models, the computing power used to train them, "
    "and the data available for this"
)
domain = "test"

# rank1=1 is the highest-weighted expert, rank2=6 the 6th highest
similarities, fig = analyze_expert_ranks_and_plot(
    model, tokenizer, prompt=prompt, rank1=1, rank2=6, domain=domain, include_residual=True
)

fig.show()

# Extra copies in the working directory, in addition to the files that
# analyze_expert_ranks_and_plot writes under cosine-sim-avg/
fig.write_image(f"expert-rank-comparison-{domain}-2.png")
fig.write_html(f"expert-rank-comparison-{domain}-2.html")


In [None]:
# Initialize analyzer as before
analyzer = DeepseekLayerAnalyzer(model, tokenizer)
analyzer.register_hooks()

# Process samples from a text file
file_path = "english.txt"  # or "code.txt", etc.
avg_sims, all_sims, fig = await process_samples_with_expert_ranks(
    model,
    tokenizer,
    file_path=file_path,
    start_from=1  # Start from first sample
)

# Display the average plot
fig.show()