In [1]:
import os

os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ["PYTORCH_TRANSFORMERS_SDP_BACKEND"] = "flash"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import pandas as pd
from collections import defaultdict
import plotly.graph_objects as go

In [None]:
def load_model(model_name="allenai/OLMoE-1B-7B-0924"):
    """
    Load a causal language model together with its tokenizer.

    Args:
        model_name: Checkpoint identifier, passed unchanged to both
                    `from_pretrained` calls

    Returns:
        A `(model, tokenizer)` tuple. No device or dtype is chosen here; the
        model is returned exactly as `from_pretrained` produced it.
    """
    # Tuple elements are evaluated left to right: model first, then tokenizer.
    return (
        AutoModelForCausalLM.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )

model, tokenizer = load_model()

In [3]:
def print_expert_weights(model, layer_idx, expert_idx):
    """
    Print the weights of a specific expert MLP at a given layer.

    The three projection matrices (gate, up, down) are looked up by name in
    the model's state dict and printed in that order, each preceded by a
    header line.

    Args:
        model: The OLMoE model (anything exposing `state_dict()`)
        layer_idx: Index of the layer containing the expert
        expert_idx: Index of the expert within the layer

    Raises:
        KeyError: If the state dict has no entry for one of the expected
                  `model.layers.<layer>.mlp.experts.<expert>.<proj>.weight` keys
    """
    # Build the state dict once: every call to state_dict() walks the whole
    # model, so fetching it per projection tripled that cost for no benefit.
    state = model.state_dict()
    key_prefix = f'model.layers.{layer_idx}.mlp.experts.{expert_idx}'

    for label, proj_name in (("Gate", "gate_proj"), ("Up", "up_proj"), ("Down", "down_proj")):
        print(f"\n{label} Projection for expert {expert_idx} in layer {layer_idx}:")
        print(state[f'{key_prefix}.{proj_name}.weight'])

### printing weights for experts before swapping

In [None]:
# print_expert_weights(model, layer_idx=0, expert_idx=29)

# print("\n" + "="*80 + "\n")  # Separator for readability

# print_expert_weights(model, layer_idx=0, expert_idx=42)


### vanilla inference

In [5]:
# # Test the model with a prompt on vanilla model
# prompt = (""" 
# Continue the poem naturally and coherently, maintaining consistency with the rhyme scheme, diction and imagery. Match the poem's tone and style precisely.

# we measure rainfall in memories now
# count droplets like endangered species
# my grandmother's garden is underwater
# but the roses still bloom, phosphorescent
# in depths where submarines chart
# the coordinates of lost cities, while above                  
# """)

# # Convert the prompt to inputs and run a forward pass
# inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
# # Generate output (since it's a causal LM, we need to generate text from input)
# outputs = model.generate(
#     inputs['input_ids'],  # Only provide input_ids to generate
#     attention_mask=inputs['attention_mask'],  # Add attention mask to not attend to padding tokens
#     max_new_tokens=156,    # Generate 1024 new tokens
#     temperature=0.6,       # Control randomness
#     # top_k=100,  # Use top-k sampling
#     do_sample=True,        # Use sampling instead of greedy decoding
#     eos_token_id=tokenizer.eos_token_id,
#     pad_token_id=tokenizer.eos_token_id  # Set padding token
# )

# # Decode the generated output
# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# # Print the original prompt and generated response
# print("Prompt:", prompt)
# print("\nGenerated response :", generated_text)

### swap experts between a source layer and a target layer

In [7]:
def swap_experts(model, expert_idx, target_layer_idx, source_layer_idx=0, source_expert_idx=0):
    """
    Swap the MLP weights of one expert with those of another expert.

    The `gate_proj`, `up_proj` and `down_proj` weight objects of the source
    expert and the target expert are exchanged in place on `model`. The two
    experts may live in different layers or in the same layer. Calling the
    function again with the same arguments undoes the swap.

    Args :
        model: The OLMoE model
        expert_idx: Index of the expert in target layer to swap with
        target_layer_idx: Index of the layer containing the expert to swap with
        source_layer_idx: Index of the source layer (default 0)
        source_expert_idx: Index of the source expert (default 0)

    Returns:
        A dict describing the swap:
        {'swapped_experts': {'source': {'layer', 'expert'}, 'target': {'layer', 'expert'}}}

    Raises:
        ValueError: If a layer index or an expert index is out of range.
                    Negative indices count from the end, as in normal Python
                    indexing, and are accepted while they stay in range.
    """
    # Access the decoder layers
    decoder_layers = model.model.layers

    # Verify layer indices before touching anything. The lower bound matters:
    # an out-of-range negative index would otherwise surface as a raw IndexError.
    num_layers = len(decoder_layers)
    for layer_idx in (target_layer_idx, source_layer_idx):
        if not -num_layers <= layer_idx < num_layers:
            raise ValueError(f"Layer index out of range. Model has {num_layers} layers.")

    # Get the MoE blocks from both layers
    source_moe = decoder_layers[source_layer_idx].mlp
    target_moe = decoder_layers[target_layer_idx].mlp

    # Verify each expert index against the expert count of ITS OWN layer
    # (previously both were checked against the source layer only).
    for idx, moe in ((expert_idx, target_moe), (source_expert_idx, source_moe)):
        num_experts = len(moe.experts)
        if not -num_experts <= idx < num_experts:
            raise ValueError(f"Expert index out of range. Each layer has {num_experts} experts.")

    source_expert = source_moe.experts[source_expert_idx]
    target_expert = target_moe.experts[expert_idx]

    # Exchange the weight objects of each projection. The right-hand side is
    # fully evaluated before either assignment, so no temporary is needed.
    for proj_name in ("gate_proj", "up_proj", "down_proj"):
        source_proj = getattr(source_expert, proj_name)
        target_proj = getattr(target_expert, proj_name)
        source_proj.weight, target_proj.weight = target_proj.weight, source_proj.weight

    return {
        'swapped_experts': {
            'source': {
                'layer': source_layer_idx,
                'expert': source_expert_idx
            },
            'target': {
                'layer': target_layer_idx,
                'expert': expert_idx
            }
        }
    }

In [8]:
# Experts to swap, paired by position: source_experts[k] (in source_layer_idx)
# is exchanged with target_experts[k] (in target_layer_idx).
source_experts = [29, 42]
target_experts = [1, 12]

source_layer_idx = 0
target_layer_idx = 2

# Fail before mutating the model if the pairing is incomplete. The old index
# loop ignored extra targets, or crashed half-way through when there were
# fewer targets than sources, leaving the model partially swapped.
if len(source_experts) != len(target_experts):
    raise ValueError(
        f"source_experts has {len(source_experts)} entries but "
        f"target_experts has {len(target_experts)}; they must pair up one-to-one."
    )

# NOTE: swap_experts modifies `model` in place, so running this cell a second
# time swaps the same experts back to their original positions.
for source_expert_idx, target_expert_idx in zip(source_experts, target_experts):
    swap_experts(model, expert_idx=target_expert_idx, target_layer_idx=target_layer_idx, source_layer_idx=source_layer_idx, source_expert_idx=source_expert_idx)
    print(f"Swapped experts at layer {source_layer_idx}, expert {source_expert_idx} with expert {target_expert_idx} in layer {target_layer_idx}")



torch.Size([1024, 2048])
Swapped experts at layer 0, expert 29 with expert 1 in layer 2
torch.Size([1024, 2048])
Swapped experts at layer 0, expert 42 with expert 12 in layer 2


### printing weights for experts after swapping

In [None]:
# print_expert_weights(model, layer_idx=0, expert_idx=29)

# print("\n" + "="*80 + "\n")  # Separator for readability

# print_expert_weights(model, layer_idx=2, expert_idx=1)


In [None]:
# Test the model with a prompt.
# The prompt is a raw string so that the LaTeX backslashes are taken literally.
# The previous non-raw literal mixed doubled backslashes (\\title, \\begin, \\rho)
# with invalid escape sequences (\e, \s, \m, \l), which Python only tolerates
# with a deprecation/syntax warning. The resulting text is unchanged.
prompt = (r"""    
Continue this text in a natural and coherent way, maintaining consistency with the style, 
terminology, and logical flow of the preceding text.
          
\title{Quantum Error Mitigation in NISQ Devices}
\begin{abstract}
We present a novel approach to error mitigation in noisy intermediate-scale quantum (NISQ) devices. 
Our method introduces a scaling framework for quantum channels that preserves gate fidelity while reducing environmental noise.
\end{abstract}
\section{Introduction}
Recent advances in NISQ devices have demonstrated both promise and limitations in quantum computation. 
The primary challenge remains decoherence, which introduces errors in quantum operations. We propose a channel scaling approach 
$\mathcal{N}(\rho) = e^{-\lambda t}\rho$ 
that provides a systematic way to
"""
)

# Tokenize the prompt and move the tensors to the model's device
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
# Generate a continuation (causal LM: new tokens are appended to the input)
# NOTE(review): sampling is unseeded, so every run yields a different text;
# seed torch before this call if runs need to be compared.
outputs = model.generate(
    inputs['input_ids'],  # Only provide input_ids to generate
    attention_mask=inputs['attention_mask'],  # Add attention mask to not attend to padding tokens
    max_new_tokens=156,    # Generate at most 156 new tokens
    temperature=0.6,       # Control randomness
    do_sample=True,        # Use sampling instead of greedy decoding
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id  # Set padding token
)

# Decode the first (only) returned sequence.
# NOTE(review): presumably this sequence contains the prompt tokens followed by
# the continuation, so the printed "response" repeats the prompt — confirm.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the original prompt and generated response
print("Prompt :", prompt)
print("\nGenerated response :", generated_text)

#### Top 8 and bottom 8 experts in each layer for `inputs.txt` (model: `OLMoE-1B-7B-0924`)
Layer `0`
First 8 keys: `[0, 6, 36, 41, 52, 10, 49, 21]`
Last 8 keys: `[43, 20, 63, 7, 58, 34, 39, 9]`

Layer `1`
First 8 keys: `[47, 18, 61, 25, 5, 16, 27, 7]`
Last 8 keys: `[3, 6, 33, 52, 56, 10, 43, 57]`

Layer `2`
First 8 keys: `[60, 10, 61, 45, 40, 30, 15, 26]`
Last 8 keys: `[53, 4, 55, 41, 2, 8, 51, 59]`

Layer `3`
First 8 keys: `[35, 9, 6, 62, 15, 51, 19, 43]`
Last 8 keys: `[1, 44, 24, 25, 60, 26, 16, 17]`

Layer `4`
First 8 keys: `[17, 21, 6, 27, 2, 14, 25, 55]`
Last 8 keys: `[36, 13, 54, 12, 57, 18, 15, 32]`

Layer `5`
First 8 keys: `[0, 60, 31, 57, 37, 17, 21, 2]`
Last 8 keys: `[32, 47, 51, 24, 49, 63, 15, 26]`

Layer `6`
First 8 keys: `[57, 62, 18, 36, 52, 40, 4, 31]`
Last 8 keys: `[25, 21, 45, 34, 14, 39, 11, 56]`

Layer `7`
First 8 keys: `[17, 58, 35, 2, 4, 59, 21, 61]`
Last 8 keys: `[43, 47, 41, 27, 53, 6, 51, 8]`

Layer `8`
First 8 keys: `[16, 54, 14, 18, 32, 3, 37, 6]`
Last 8 keys: `[9, 25, 33, 4, 47, 62, 19, 11]`

Layer `9`
First 8 keys: `[5, 8, 6, 4, 28, 14, 7, 20]`
Last 8 keys: `[61, 10, 0, 60, 35, 58, 56, 37]`

Layer `10`
First 8 keys: `[56, 43, 11, 59, 22, 60, 13, 28]`
Last 8 keys: `[53, 63, 57, 33, 1, 31, 10, 54]`

Layer `11`
First 8 keys: `[47, 23, 27, 51, 54, 33, 52, 25]`
Last 8 keys: `[55, 29, 49, 14, 53, 19, 17, 59]`

Layer `12`
First 8 keys: `[43, 55, 59, 38, 31, 58, 47, 44]`
Last 8 keys: `[18, 19, 27, 61, 45, 3, 37, 0]`

Layer `13`
First 8 keys: `[2, 32, 5, 20, 25, 22, 55, 61]`
Last 8 keys: `[56, 17, 40, 1, 48, 52, 21, 36]`

Layer `14`
First 8 keys: `[9, 58, 6, 4, 24, 52, 11, 17]`
Last 8 keys: `[41, 3, 26, 13, 43, 25, 27, 55]`

Layer `15`
First 8 keys: `[17, 1, 34, 44, 50, 45, 30, 54]`
Last 8 keys: `[18, 15, 37, 26, 7, 38, 21, 51]`



#### sorting expert dicts

In [None]:
def get_top_and_bottom_keys(input_dict, n=8):
    """
    Return the keys with the `n` largest and the `n` smallest values.

    Keys are ranked by their value in decreasing order; keys with equal values
    keep the order in which they appear in `input_dict`.

    Args:
        input_dict: Mapping from key to a sortable value
        n: How many keys to take from each end of the ranking (default 8).
           If `n` exceeds the number of keys, both lists contain every key.

    Returns:
        A `(first_n_keys, last_n_keys)` tuple of lists, both in decreasing
        order of value. Both are empty when `n` is 0.

    Raises:
        ValueError: If `n` is negative
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Rank by value, largest first (sorted() is stable, so ties keep dict order)
    ranked_items = sorted(input_dict.items(), key=lambda item: item[1], reverse=True)
    ranked_keys = [key for key, _ in ranked_items]

    first_n_keys = ranked_keys[:n]
    # ranked_keys[-0:] is the WHOLE list, so n == 0 needs an explicit empty result
    last_n_keys = ranked_keys[-n:] if n > 0 else []

    return first_n_keys, last_n_keys

# Example usage with one layer's expert dict (expert index -> value; presumably
# how often each expert was selected — confirm against the code that built it).
# NOTE(review): this was labelled "layer 3", but the keys this dict produces
# match the Layer `4` row of the markdown table above, not the Layer `3` row —
# confirm which label is intended.
example_dict =  {27: 151, 19: 19, 55: 86, 6: 162, 25: 105, 41: 50, 48: 26, 17: 753, 3: 44, 21: 275, 7: 63, 61: 23, 39: 31, 59: 16, 2: 141, 43: 14, 10: 11, 62: 14, 33: 32, 11: 16, 52: 34, 14: 106, 35: 35, 5: 44, 60: 27, 24: 23, 58: 18, 63: 9, 51: 38, 8: 23, 4: 10, 16: 11, 23: 12, 13: 6, 22: 14, 38: 21, 34: 26, 26: 28, 29: 9, 50: 20, 46: 16, 9: 21, 37: 22, 28: 9, 53: 48, 1: 30, 20: 21, 45: 36, 57: 3, 42: 45, 56: 12, 44: 13, 30: 24, 40: 12, 0: 21, 18: 3, 54: 6, 12: 5, 36: 7, 15: 2, 32: 1}

# Uses the default n=8: the 8 highest-valued and the 8 lowest-valued keys
first_8_keys, last_8_keys = get_top_and_bottom_keys(example_dict)
print("First 8 keys:", first_8_keys)
print("Last 8 keys:", last_8_keys)

### zero out experts

In [None]:
def zero_expert(model, layer_idx, expert_idx):
    """
    Zero out a specific expert in a specific layer of the OLMoE model.

    The `gate_proj`, `up_proj` and `down_proj` weights of the expert are
    overwritten with zeros in place; the original values are not kept.

    Args:
        model: The OLMoE model
        layer_idx: Index of the layer containing the expert
        expert_idx: Index of the expert to zero out

    Returns:
        A dict identifying what was zeroed:
        {'zeroed_expert': {'layer': layer_idx, 'expert': expert_idx}}

    Raises:
        ValueError: If `layer_idx` or `expert_idx` is out of range. Negative
                    indices count from the end, as in normal Python indexing,
                    and are accepted while they stay in range.
    """
    # Access the decoder layers
    decoder_layers = model.model.layers

    # Verify the layer index. The lower bound matters: an out-of-range negative
    # index would otherwise surface as a raw IndexError.
    num_layers = len(decoder_layers)
    if not -num_layers <= layer_idx < num_layers:
        raise ValueError(f"Layer index out of range. Model has {num_layers} layers.")

    # Get the MoE block
    moe = decoder_layers[layer_idx].mlp

    # Verify the expert index the same way
    num_experts = len(moe.experts)
    if not -num_experts <= expert_idx < num_experts:
        raise ValueError(f"Expert index out of range. Layer has {num_experts} experts.")

    expert = moe.experts[expert_idx]

    # Zero out all three projection matrices in place, writing through `.data`
    for proj_name in ("gate_proj", "up_proj", "down_proj"):
        getattr(expert, proj_name).weight.data.zero_()

    return {
        'zeroed_expert': {
            'layer': layer_idx,
            'expert': expert_idx
        }
    }


In [None]:
def zero_multiple_experts(model, expert_indices_per_layer):
    """
    Zero out a batch of experts, possibly spread over several layers.

    Each (layer, expert) pair is handed to `zero_expert` in the iteration
    order of the mapping, and a confirmation line is printed after each one.

    Args:
        model: The OLMoE model
        expert_indices_per_layer: Dict mapping layer indices to lists of expert indices to zero
                                e.g. {0: [0,1], 1: [2,3]} zeros experts 0,1 in layer 0 and 2,3 in layer 1

    Returns:
        List with the dict returned by `zero_expert` for every zeroed expert,
        in processing order.
    """
    # Lazily flatten {layer: [experts]} into (layer, expert) pairs
    pairs = (
        (layer_idx, expert_idx)
        for layer_idx, expert_indices in expert_indices_per_layer.items()
        for expert_idx in expert_indices
    )

    results = []
    for layer_idx, expert_idx in pairs:
        results.append(zero_expert(model, layer_idx, expert_idx))
        print(f"Zeroed out expert {expert_idx} in layer {layer_idx}")

    return results
