In [1]:
import os

os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ["PYTORCH_TRANSFORMERS_SDP_BACKEND"] = "flash"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import json
import pandas as pd
from collections import defaultdict
import plotly.graph_objects as go

In [2]:
def load_model(model_name="allenai/OLMoE-1B-7B-0924"):
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
def print_expert_weights(model, layer_idx, expert_idx):
    """
    Print the weights of a specific expert MLP at a given layer.
    
    Args:
        model: The OLMoE model
        layer_idx: Index of the layer containing the expert
        expert_idx: Index of the expert within the layer
    """
    gate_proj = f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj.weight'
    up_proj = f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj.weight'
    down_proj = f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj.weight'
    
    print("\nGate Projection:")
    print(model.state_dict()[gate_proj])
    print("\nUp Projection:") 
    print(model.state_dict()[up_proj])
    print("\nDown Projection:")
    print(model.state_dict()[down_proj])


In [4]:
# Print weights for layer 0 expert 0
print_expert_weights(model, layer_idx=0, expert_idx=0)

print("\n" + "="*80 + "\n")  # Separator for readability

# Print weights for layer 0 expert 1
print_expert_weights(model, layer_idx=0, expert_idx=1)



Gate Projection:
tensor([[-0.0118,  0.0150,  0.0190,  ..., -0.0022, -0.0043,  0.0098],
        [-0.0156,  0.0110,  0.0079,  ..., -0.0136, -0.0059,  0.0064],
        [-0.0004, -0.0051, -0.0077,  ...,  0.0092,  0.0153, -0.0466],
        ...,
        [-0.0208, -0.0040,  0.0033,  ...,  0.0048, -0.0037,  0.0115],
        [-0.0066,  0.0152,  0.0057,  ..., -0.0050, -0.0275, -0.0078],
        [ 0.0004, -0.0062,  0.0060,  ...,  0.0011, -0.0302,  0.0057]])

Up Projection:
tensor([[ 0.0038, -0.0003,  0.0136,  ..., -0.0016,  0.0109, -0.0172],
        [ 0.0078, -0.0044,  0.0027,  ...,  0.0194,  0.0127,  0.0271],
        [ 0.0020,  0.0182,  0.0090,  ...,  0.0281, -0.0231, -0.0129],
        ...,
        [ 0.0015,  0.0172,  0.0036,  ...,  0.0123, -0.0040, -0.0101],
        [-0.0208,  0.0119,  0.0157,  ..., -0.0388,  0.0206, -0.0027],
        [-0.0113,  0.0016, -0.0238,  ..., -0.0243, -0.0132,  0.0069]])

Down Projection:
tensor([[-0.0053,  0.0125, -0.0048,  ...,  0.0077, -0.0144, -0.0030],
        [-

In [5]:
# # Test the model with a prompt
# prompt = (""" 
# Continue the poem naturally and coherently, maintaining consistency with the rhyme scheme, diction and imagery. Match the poem's tone and style precisely.

# we measure rainfall in memories now
# count droplets like endangered species
# my grandmother's garden is underwater
# but the roses still bloom, phosphorescent
# in depths where submarines chart
# the coordinates of lost cities, while above                  
# """)

# # Convert the prompt to inputs and run a forward pass
# inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
# # Generate output (since it's a causal LM, we need to generate text from input)
# outputs = model.generate(
#     inputs['input_ids'],  # Only provide input_ids to generate
#     attention_mask=inputs['attention_mask'],  # Add attention mask to not attend to padding tokens
#     max_new_tokens=156,    # Generate 1024 new tokens
#     temperature=0.6,       # Control randomness
#     # top_k=100,  # Use top-k sampling
#     do_sample=True,        # Use sampling instead of greedy decoding
#     eos_token_id=tokenizer.eos_token_id,
#     pad_token_id=tokenizer.eos_token_id  # Set padding token
# )

# # Decode the generated output
# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# # Print the original prompt and generated response
# print("Prompt:", prompt)
# print("\nGenerated response :", generated_text)

In [6]:
def swap_experts(model, expert_idx, target_layer_idx, source_layer_idx=0, source_expert_idx=0):
    """
    Swap experts between two layers in the OLMoE model.
    
    Args:
        model: The OLMoE model
        expert_idx: Index of the expert in target layer to swap with
        target_layer_idx: Index of the layer containing the expert to swap with
        source_layer_idx: Index of the source layer (default 0)
        source_expert_idx: Index of the source expert (default 0)

    """
    # Access the decoder layers
    decoder_layers = model.model.layers
    print(decoder_layers[0].mlp.experts[0].gate_proj.weight.shape)
    
    # Verify indices are valid
    num_layers = len(decoder_layers)
    if target_layer_idx >= num_layers or source_layer_idx >= num_layers:
        raise ValueError(f"Layer index out of range. Model has {num_layers} layers.")
    
    # Get the MoE blocks from both layers
    source_moe = decoder_layers[source_layer_idx].mlp
    target_moe = decoder_layers[target_layer_idx].mlp
    
    # Verify expert indices are valid
    num_experts = len(source_moe.experts)
    if expert_idx >= num_experts or source_expert_idx >= num_experts:
        raise ValueError(f"Expert index out of range. Each layer has {num_experts} experts.")
        
    # Swap the expert weights
    source_expert = source_moe.experts[source_expert_idx]
    target_expert = target_moe.experts[expert_idx]
    
    # Swap gate projection weights
    source_expert.gate_proj.weight, target_expert.gate_proj.weight = \
        target_expert.gate_proj.weight, source_expert.gate_proj.weight
        
    # Swap up projection weights
    source_expert.up_proj.weight, target_expert.up_proj.weight = \
        target_expert.up_proj.weight, source_expert.up_proj.weight
        
    # Swap down projection weights  
    source_expert.down_proj.weight, target_expert.down_proj.weight = \
        target_expert.down_proj.weight, source_expert.down_proj.weight
    
    return {
        'swapped_experts': {
            'source': {
                'layer': source_layer_idx,
                'expert': source_expert_idx
            },
            'target': {
                'layer': target_layer_idx,
                'expert': expert_idx
            }
        }
    }

#### gemini logic

In [7]:
# def swap_experts(model, expert_idx, target_layer_idx, source_layer_idx=0, source_expert_idx=0):
#     """
#     Swap experts between two layers in the OLMoE model.

#     Args:
#         model: The OLMoE model
#         expert_idx: Index of the expert in target layer to swap with
#         target_layer_idx: Index of the layer containing the expert to swap with
#         source_layer_idx: Index of the source layer (default 0)
#         source_expert_idx: Index of the source expert (default 0)
#     """
#     # Access the decoder layers
#     decoder_layers = model.model.layers
#     print(decoder_layers[0].mlp.experts[0].gate_proj.weight.shape)

#     # Verify indices are valid
#     num_layers = len(decoder_layers)
#     if target_layer_idx >= num_layers or source_layer_idx >= num_layers:
#         raise ValueError(f"Layer index out of range. Model has {num_layers} layers.")

#     # Get the MoE blocks from both layers
#     source_moe = decoder_layers[source_layer_idx].mlp
#     target_moe = decoder_layers[target_layer_idx].mlp

#     # Verify expert indices are valid
#     num_experts = len(source_moe.experts)
#     if expert_idx >= num_experts or source_expert_idx >= num_experts:
#         raise ValueError(f"Expert index out of range. Each layer has {num_experts} experts.")

#     # Swap the expert weights
#     source_expert = source_moe.experts[source_expert_idx]
#     target_expert = target_moe.experts[expert_idx]

#     # Swap gate projection weights
#     temp_gate_proj_weight = source_expert.gate_proj.weight.data.clone()
#     source_expert.gate_proj.weight.data = target_expert.gate_proj.weight.data.clone()
#     target_expert.gate_proj.weight.data = temp_gate_proj_weight

#     # Swap up projection weights
#     temp_up_proj_weight = source_expert.up_proj.weight.data.clone()
#     source_expert.up_proj.weight.data = target_expert.up_proj.weight.data.clone()
#     target_expert.up_proj.weight.data = temp_up_proj_weight

#     # Swap down projection weights
#     temp_down_proj_weight = source_expert.down_proj.weight.data.clone()
#     source_expert.down_proj.weight.data = target_expert.down_proj.weight.data.clone()
#     target_expert.down_proj.weight.data = temp_down_proj_weight
    
#     return {
#         'swapped_experts': {
#             'source': {
#                 'layer': source_layer_idx,
#                 'expert': source_expert_idx
#             },
#             'target': {
#                 'layer': target_layer_idx,
#                 'expert': expert_idx
#             }
#         }
#     }

In [8]:
# Create lists of experts to swap
top_experts_list = [0] # layer 15
bottom_experts_list = [1] # layer 15

layer_idx = 0
# Swap experts at each index
for i in range(len(top_experts_list)):
    swap_experts(model, expert_idx=top_experts_list[i], target_layer_idx=layer_idx, source_layer_idx=layer_idx, source_expert_idx=bottom_experts_list[i])
    print(f"Swapped experts at layer {layer_idx}, top expert {top_experts_list[i]} with bottom expert {bottom_experts_list[i]}")



torch.Size([1024, 2048])
Swapped experts at layer 0, top expert 0 with bottom expert 1


In [9]:
# Print weights for layer 0 expert 0
print_expert_weights(model, layer_idx=0, expert_idx=0)

print("\n" + "="*80 + "\n")  # Separator for readability

# Print weights for layer 0 expert 1
print_expert_weights(model, layer_idx=0, expert_idx=1)



Gate Projection:
tensor([[ 0.0047,  0.0144,  0.0098,  ..., -0.0136,  0.0004, -0.0096],
        [ 0.0166,  0.0229, -0.0249,  ...,  0.0093,  0.0170, -0.0167],
        [-0.0054, -0.0134,  0.0197,  ..., -0.0031,  0.0035,  0.0121],
        ...,
        [ 0.0183, -0.0057,  0.0187,  ..., -0.0131,  0.0139, -0.0266],
        [-0.0187,  0.0104,  0.0003,  ...,  0.0239, -0.0026, -0.0034],
        [ 0.0164,  0.0256, -0.0092,  ...,  0.0055, -0.0097,  0.0016]])

Up Projection:
tensor([[-0.0283, -0.0043,  0.0032,  ..., -0.0098,  0.0206,  0.0082],
        [ 0.0025, -0.0062, -0.0189,  ...,  0.0159, -0.0078,  0.0192],
        [ 0.0066, -0.0167, -0.0043,  ..., -0.0079, -0.0193, -0.0078],
        ...,
        [ 0.0074,  0.0029,  0.0161,  ..., -0.0123,  0.0339, -0.0276],
        [-0.0248,  0.0226,  0.0152,  ...,  0.0004,  0.0189,  0.0175],
        [ 0.0018,  0.0214, -0.0118,  ..., -0.0059, -0.0383,  0.0023]])

Down Projection:
tensor([[-0.0134,  0.0007,  0.0071,  ...,  0.0142, -0.0272, -0.0029],
        [-

In [7]:
# Test the model with a prompt
prompt = ("""    
Continue this text in a natural and coherent way, maintaining consistency with the style, 
terminology, and logical flow of the preceding text.
          
\\title{Quantum Error Mitigation in NISQ Devices}
\\begin{abstract}
We present a novel approach to error mitigation in noisy intermediate-scale quantum (NISQ) devices. 
Our method introduces a scaling framework for quantum channels that preserves gate fidelity while reducing environmental noise.
\end{abstract}
\section{Introduction}
Recent advances in NISQ devices have demonstrated both promise and limitations in quantum computation. 
The primary challenge remains decoherence, which introduces errors in quantum operations. We propose a channel scaling approach 
$\mathcal{N}(\\rho) = e^{-\lambda t}\\rho$ 
that provides a systematic way to
"""
)

# Convert the prompt to inputs and run a forward pass
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
# Generate output (since it's a causal LM, we need to generate text from input)
outputs = model.generate(
    inputs['input_ids'],  # Only provide input_ids to generate
    attention_mask=inputs['attention_mask'],  # Add attention mask to not attend to padding tokens
    max_new_tokens=156,    # Generate 100 new tokens
    temperature=0.6,       # Control randomness
    do_sample=True,        # Use sampling instead of greedy decoding
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id  # Set padding token
)

# Decode the generated output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the original prompt and generated response
print("Prompt:", prompt)
print("\nGenerated response:", generated_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:     
Continue this text in a natural and coherent way, maintaining consistency with the style, 
terminology, and logical flow of the preceding text.
          
\title{Quantum Error Mitigation in NISQ Devices}
\begin{abstract}
We present a novel approach to error mitigation in noisy intermediate-scale quantum (NISQ) devices. 
Our method introduces a scaling framework for quantum channels that preserves gate fidelity while reducing environmental noise.
\end{abstract}
\section{Introduction}
Recent advances in NISQ devices have demonstrated both promise and limitations in quantum computation. 
The primary challenge remains decoherence, which introduces errors in quantum operations. We propose a channel scaling approach 
$\mathcal{N}(\rho) = e^{-\lambda t}\rho$ 
that provides a systematic way to


Generated response:     
Continue this text in a natural and coherent way, maintaining consistency with the style, 
terminology, and logical flow of the preceding text.
          
\title{Q

In [None]:
for key in model.state_dict().keys():
    print(key)

#### sorting expert dicts

In [7]:
# layer 14
example_dict = {60: 56, 58: 290, 24: 111, 6: 286, 36: 34, 11: 80, 9: 474, 52: 84, 34: 32, 19: 61, 7: 66, 1: 18, 50: 37, 4: 132, 5: 20, 22: 17, 29: 4, 42: 62, 17: 76, 54: 53, 46: 33, 51: 52, 26: 3, 39: 25, 47: 20, 13: 3, 18: 25, 23: 14, 16: 44, 28: 13, 12: 10, 57: 36, 40: 27, 63: 35, 35: 42, 38: 55, 48: 63, 20: 32, 15: 44, 31: 7, 44: 5, 14: 5, 41: 4, 45: 12, 33: 41, 62: 16, 10: 5, 56: 24, 61: 6, 37: 26, 53: 25, 49: 31, 59: 51, 21: 23, 3: 4, 8: 16, 25: 1, 2: 8, 43: 2, 0: 7, 32: 13, 27: 1, 55: 1}

# Sorting by values in decreasing order
sorted_items = sorted(example_dict.items(), key=lambda item: item[1], reverse=True)

# Extract keys from the sorted items
sorted_keys = [item[0] for item in sorted_items]
# Get the first 8 keys and the last 8 keys
first_8_keys = sorted_keys[:8]
last_8_keys = sorted_keys[-8:]

# Output the two lists
print("First 8 keys:", first_8_keys)
print("Last 8 keys:", last_8_keys)

First 8 keys: [17, 1, 34, 44, 50, 45, 30, 54]
Last 8 keys: [18, 15, 37, 26, 7, 38, 21, 51]
