# Steering MoE LLMs with Custom Datasets

In this tutorial, we show how to steer MoE LLMs using a custom dataset.
We focus on controlling whether the model writes numbers as digits (1, 2, 3) or as words (one, two, three), and show that the relevant experts can be identified from just one example pair!

Steps:
1. Load a pre-trained MoE LLM.

2. Prepare a custom steering dataset.

3. Save routing activations on the dataset pairs.

4. Identify behavior-linked experts via risk difference.

5. (De)activate experts at inference to steer model behavior.

In [None]:
# Copyright 2022 Adobe
# All Rights Reserved.

# NOTICE: Adobe permits you to use, modify, and distribute this file in
# accordance with the terms of the Adobe license agreement accompanying
# it.


import os
# Process-wide configuration, assigned before torch / vllm / huggingface_hub
# are imported further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only GPU 0 to this process
os.environ["HF_HOME"] = "/mnt/localssd/.hfcache/"  # Hugging Face cache directory (machine-specific path; adjust for your setup)
os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"  # NOTE(review): presumably needed so llm.collective_rpc can ship a Python function to the engine -- confirm
os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"  # NOTE(review): presumably avoids reusing compiled artifacts from unpatched models -- confirm
os.environ["TORCHDYNAMO_VERBOSE"] = "1"  # more detailed TorchDynamo error output
os.environ["TRUST_REMOTE_CODE"] = "true"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

# Directory holding the per-layer "router_logits_L{layer}.npy" files that
# get_routings() deletes before, and loads after, each llm.chat call.
os.environ["TEMP_NPY_BASE_PATH"] = "./temp_routings/"

import sys
import torch
import argparse
import numpy as np
import pandas as pd
from scipy import stats
from tqdm.auto import tqdm
from importlib import reload
from dotenv import load_dotenv
import huggingface_hub as hf_hub
from vllm import LLM, SamplingParams
from transformers import AutoModelForCausalLM, AutoTokenizer

from src.utils import register_vllm_save_models, register_vllm_models, steer_moe

# Best-effort Hugging Face login: every failure is reported and the notebook
# continues without authentication.
try:
    load_dotenv()  # also picks up HF_TOKEN from a local .env file, if present
    hf_hub.login(os.environ["HF_TOKEN"])
except KeyError:
    # The token is simply not configured.
    print("HF_TOKEN not found in environment variables. Continuing without login.")
except Exception as e:
    # A token was found but logging in failed; report the real cause instead of
    # claiming that the token is missing.
    print(f"Hugging Face login failed ({e!r}). Continuing without login.")

# Make sure the directory for the temporary router-logit files exists.
# exist_ok=True replaces the separate existence check.
os.makedirs(os.environ["TEMP_NPY_BASE_PATH"], exist_ok=True)

INFO 09-02 18:12:53 [__init__.py:241] Automatically detected platform cuda.
HF_TOKEN not found in environment variables. Continuing without login.


# 1. Model

In [2]:
# Models supported by this tutorial:
#   "Qwen/Qwen3-30B-A3B", "openai/gpt-oss-120b",
#   "microsoft/Phi-3.5-MoE-instruct", "openai/gpt-oss-20b",
#   "mistralai/Mixtral-8x7B-Instruct-v0.1", "allenai/OLMoE-1B-7B-0125-Instruct"
MODEL_NAME = "Qwen/Qwen3-30B-A3B"

# NOTE(review): presumably registers model variants that dump their router
# logits to TEMP_NPY_BASE_PATH -- confirm against src.utils.
register_vllm_save_models()

# A single greedy token is generated per request: get_routings() only uses the
# prompt token ids and the saved router logits, not the generated text.
sampling_params = SamplingParams(
    temperature=0,
    top_p=0.8,
    top_k=1,
    min_p=0,
    max_tokens=1,
    seed=0,
)

llm = LLM(
    model=MODEL_NAME,
    trust_remote_code=True,
    tensor_parallel_size=torch.cuda.device_count(),
    gpu_memory_utilization=0.95,
    max_model_len=4000,
    max_seq_len_to_capture=4000,
    max_num_seqs=1,
    enforce_eager=True,
    enable_prefix_caching=False,
)

INFO 09-02 18:12:54 [utils.py:326] non-default args: {'model': 'Qwen/Qwen3-30B-A3B', 'trust_remote_code': True, 'max_model_len': 4000, 'enable_prefix_caching': False, 'gpu_memory_utilization': 0.95, 'max_num_seqs': 1, 'disable_log_stats': True, 'enforce_eager': True, 'max_seq_len_to_capture': 4000}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 09-02 18:13:02 [__init__.py:711] Resolved architecture: Qwen3MoeForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 09-02 18:13:02 [__init__.py:1750] Using max model len 4000
INFO 09-02 18:13:05 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 09-02 18:13:05 [__init__.py:3565] Cudagraph is disabled under eager mode
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:06 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:06 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='Qwen/Qwen3-30B-A3B', speculative_config=None, tokenizer='Qwen/Qwen3-30B-A3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4000, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disab

Loading safetensors checkpoint shards:   0% Completed | 0/16 [00:00<?, ?it/s]


[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:29 [default_loader.py:262] Loading weights took 19.11 seconds
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:30 [gpu_model_runner.py:2007] Model loading took 56.8814 GiB and 20.010527 seconds
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:37 [gpu_worker.py:276] Available KV cache memory: 17.83 GiB
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:38 [kv_cache_utils.py:849] GPU KV cache size: 194,784 tokens
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:38 [kv_cache_utils.py:853] Maximum concurrency for 4,000 tokens per request: 48.70x
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:38 [core.py:214] init engine (profile, create kv cache, warmup model) took 8.06 seconds
[1;36m(EngineCore_0 pid=3336907)[0;0m INFO 09-02 18:13:39 [__init__.py:3565] Cudagraph is disabled under eager mode
INFO 09-02 18:13:39 [llm.py:298] Supported_tasks: ('generate',)




In [3]:
def get_routings(messages, max_layers=500):
    """
    Run `messages` through the model and collect the router logits of every MoE layer.

    The per-layer logits are read from "router_logits_L{layer}.npy" files inside
    the directory named by the TEMP_NPY_BASE_PATH environment variable.
    NOTE(review): these files are presumably written during `llm.chat` by the
    models installed via register_vllm_save_models() -- confirm.

    Args:
        messages: Conversation(s) forwarded unchanged to `llm.chat`.
        max_layers: Upper bound on the layer indices that are probed for files.

    Returns:
        dict with
            "router_logits": float16 array, (num_layers, num_tokens, n_experts),
            "messages": the input `messages`,
            "prompt_token_ids": prompt token ids of the first request output.

    Raises:
        ValueError: if no router-logit file was produced by the forward pass.
    """
    base_path = os.environ["TEMP_NPY_BASE_PATH"]
    layer_paths = [
        os.path.join(base_path, f"router_logits_L{layer}.npy")
        for layer in range(max_layers)
    ]

    # Remove files left over from a previous call so they cannot be mistaken
    # for the logits of this request.
    for layer_path in layer_paths:
        if os.path.exists(layer_path):
            os.remove(layer_path)

    outputs = llm.chat(messages, sampling_params, use_tqdm=False, chat_template_kwargs={"enable_thinking": False, "reasoning_effort": "low"})

    # Layer indices without a file are skipped, so the stacked array only
    # contains the layers that actually produced logits.
    all_router_logits = []
    for layer_path in layer_paths:
        try:
            all_router_logits.append(np.load(layer_path).astype(np.float16))
        except FileNotFoundError:
            continue

    if not all_router_logits:
        raise ValueError(
            f"No router logits were found in '{base_path}'. "
            "Was register_vllm_save_models() called before the LLM was created?"
        )

    return {
        "router_logits": np.stack(all_router_logits, axis=0),  # (num_layers, num_tokens, n_experts)
        "messages": messages,
        "prompt_token_ids": outputs[0].prompt_token_ids,
    }

# Smoke test: route a one-turn conversation and inspect what comes back.
messages = [[{"role": "user", "content": "Hello"}]]
r = get_routings(messages)

# Returned keys and the number of prompt tokens.
print(r.keys(), len(r["prompt_token_ids"]))

# Cell result: logits shape plus a small corner of the logits array.
(r["router_logits"].shape, r["router_logits"][:2, :2, :2])

INFO 09-02 18:13:41 [chat_utils.py:470] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
dict_keys(['router_logits', 'messages', 'prompt_token_ids']) 13


((48, 13, 128),
 array([[[-4.062, -3.86 ],
         [-4.22 , -5.594]],
 
        [[-3.188, -4.562],
         [-4.97 , -4.688]]], dtype=float16))

# 2. Dataset

In [4]:
# Steering dataset: one row per contrastive example pair.
#   - messages_0:        conversation showing the first behavior (e.g. safe)
#   - messages_1:        conversation showing the second behavior (e.g. unsafe)
#   - messages_0_target: substring of messages_0 whose tokens are compared
#   - messages_1_target: substring of messages_1 whose tokens are compared
DATASET_NAME = "custom_dataset"

_PROMPT = "Count to ten"
_DIGITS_ANSWER = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10"
_WORDS_ANSWER = "one, two, three, four, five, six, seven, eight, nine, ten"


def _conversation(answer):
    """Return a fresh user/assistant exchange in which the assistant replies `answer`."""
    return [
        {"role": "user", "content": _PROMPT},
        {"role": "assistant", "content": answer},
    ]


df_ds = pd.DataFrame([
    {
        "messages_0": _conversation(_DIGITS_ANSWER),
        "messages_1": _conversation(_WORDS_ANSWER),
        "messages_0_target": _DIGITS_ANSWER,
        "messages_1_target": _WORDS_ANSWER,
    },
])

df_ds

Unnamed: 0,messages_0,messages_1,messages_0_target,messages_1_target
0,"[{'role': 'user', 'content': 'Count to ten'}, ...","[{'role': 'user', 'content': 'Count to ten'}, ...","1, 2, 3, 4, 5, 6, 7, 8, 9, 10","one, two, three, four, five, six, seven, eight..."


# 3. Save Routings

In [5]:
def find_sub_list(sl, l):
    """Return the inclusive (start, end) index pairs of every occurrence of `sl` in `l`.

    An empty `sl` matches nothing and yields an empty list.
    """
    if not sl:
        return []
    sll = len(sl)
    return [
        (ind, ind + sll - 1)
        for ind, e in enumerate(l)
        if e == sl[0] and l[ind:ind + sll] == sl
    ]


def get_model_num_experts(self):
    """Read `num_experts_per_tok` from the loaded model; called through `llm.collective_rpc`.

    NOTE: the first branch returns the raw config value, the other two return it
    formatted as a string, so consumers should normalise the result with int().
    """
    model = self.model_runner.model
    if hasattr(model, "model_config") and hasattr(model.model_config, "num_experts_per_tok"):
        return model.model_config.num_experts_per_tok  # gpt-oss
    elif hasattr(model.config, "num_experts_per_tok"):
        num_experts_per_tok = f"{model.config.num_experts_per_tok}"
    else:
        num_experts_per_tok = f"{model.config.text_config.num_experts_per_tok}"  # llama4
    return num_experts_per_tok


if len(df_ds) == 0:
    raise ValueError("df_ds is empty: at least one example pair is required to save routings.")

# Both depend only on the model, so they are fetched once rather than once per
# pair choice.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
num_experts_per_tok = llm.collective_rpc(get_model_num_experts)[0]

for PAIR_CHOICE in ["messages_0", "messages_1"]:
    col = f"{PAIR_CHOICE}_target"
    outputs_list = []
    for i in tqdm(range(len(df_ds))):
        example = df_ds.iloc[i]
        # Get routings
        messages = example[PAIR_CHOICE]
        outputs = get_routings(messages)  # 'router_logits', 'messages', 'prompt_token_ids'
        # Check shapes: one routing vector per prompt token
        num_layers, num_tokens, n_experts = outputs["router_logits"].shape
        assert num_tokens == len(outputs["prompt_token_ids"])
        print(outputs["router_logits"].shape)
        # Store prompt tokens
        outputs["prompt_tokens"] = tokenizer.convert_ids_to_tokens(outputs["prompt_token_ids"], skip_special_tokens=False)
        outputs["prompt_tokens_special_mask"] = tokenizer.get_special_tokens_mask(
            outputs["prompt_token_ids"], already_has_special_tokens=True,
        )
        # Store the messages in a tokenized format (each content is tokenized once)
        messages_tokenized = []
        for message in messages:
            content_token_ids = tokenizer(message["content"], add_special_tokens=False).input_ids
            messages_tokenized.append({
                "role": message["role"],
                "content_token_ids": content_token_ids,
                "content_tokens": tokenizer.convert_ids_to_tokens(content_token_ids),
            })
        outputs["messages_tokenized"] = messages_tokenized
        # Store the target text and its tokenized form (expert detection compares routings on these tokens)
        target_text = example[col]
        target_token_ids = tokenizer(target_text, add_special_tokens=False).input_ids
        target_tokens = tokenizer.convert_ids_to_tokens(target_token_ids, skip_special_tokens=False)
        locations = find_sub_list(target_token_ids, outputs["prompt_token_ids"])
        assert len(locations) >= 1, f"Target text not found in prompt: \n{target_tokens}, in prompt tokens: \n{outputs['prompt_tokens']}"
        if len(locations) > 1:
            print(f"Expected exactly one location: {locations}, for target text: \n{target_tokens}, in prompt tokens: \n{outputs['prompt_tokens']}")
            print("Using last one")
        # With a single match this is that match; with several, the last one is used.
        start_idx, end_idx = locations[-1]
        outputs[col] = {
            "text": target_text,
            "tokens": target_tokens,
            "token_ids": target_token_ids,
            "start_idx": start_idx,
            "end_idx": end_idx,
        }
        # Append the outputs for this example to the list
        outputs_list.append(outputs)

    print(num_experts_per_tok)
    print(outputs_list[0].keys())
    df = pd.DataFrame(outputs_list)
    df.attrs = {
        "model_name": MODEL_NAME,
        "dataset_name": DATASET_NAME,
        "doc_choice": PAIR_CHOICE,
        "num_experts": n_experts,
        "num_experts_per_tok": num_experts_per_tok,
        "col_names": {},
    }
    path = f"output_[{MODEL_NAME.replace('/', '--')}]_[{DATASET_NAME}]_[{PAIR_CHOICE}]_[{len(df)}].pkl"
    df.to_pickle(path)
    print("### SAVED ROUTINGS AT:", path)
    print(len(df))
    print(df.attrs)

  0%|          | 0/1 [00:00<?, ?it/s]

(48, 53, 128)
8
dict_keys(['router_logits', 'messages', 'prompt_token_ids', 'prompt_tokens', 'prompt_tokens_special_mask', 'messages_tokenized', 'messages_0_target'])
### SAVED ROUTINGS AT: output_[Qwen--Qwen3-30B-A3B]_[custom_dataset]_[messages_0]_[1].pkl
1
{'model_name': 'Qwen/Qwen3-30B-A3B', 'dataset_name': 'custom_dataset', 'doc_choice': 'messages_0', 'num_experts': 128, 'num_experts_per_tok': '8', 'col_names': {}}


  0%|          | 0/1 [00:00<?, ?it/s]

(48, 43, 128)
8
dict_keys(['router_logits', 'messages', 'prompt_token_ids', 'prompt_tokens', 'prompt_tokens_special_mask', 'messages_tokenized', 'messages_1_target'])
### SAVED ROUTINGS AT: output_[Qwen--Qwen3-30B-A3B]_[custom_dataset]_[messages_1]_[1].pkl
1
{'model_name': 'Qwen/Qwen3-30B-A3B', 'dataset_name': 'custom_dataset', 'doc_choice': 'messages_1', 'num_experts': 128, 'num_experts_per_tok': '8', 'col_names': {}}


# 4. Detect Experts

In [6]:
# Reload the routings saved in step 3, one DataFrame per pair choice.
# The example count in the file name comes from `df_ds` (one saved row per
# dataset row) rather than from `df`, because `df` is rebound to the
# risk-difference table later in this notebook and would break a re-run.
# NOTE: pickle files can execute code when loaded; only load files that this
# notebook wrote itself.
dfs = {
    pair_choice: pd.read_pickle(
        f"output_[{MODEL_NAME.replace('/', '--')}]_[{DATASET_NAME}]_[{pair_choice}]_[{len(df_ds)}].pkl"
    )
    for pair_choice in ("messages_0", "messages_1")
}

In [7]:
# Tag for the token-reduction strategy ("rd" presumably stands for risk
# difference -- confirm). It is embedded in the activations file name, and any
# value containing "eq" additionally truncates both token sets to equal length.
TOKEN_REDUCE_FN = "rd"

def find_sub_list(sl, l):
    """Locate every occurrence of the sub-list `sl` inside the list `l`.

    Args:
        sl: The sub-list to search for.
        l: The list to search in.

    Returns:
        A list of inclusive (start_idx, end_idx) tuples, one per occurrence, in
        increasing order of start index. Overlapping occurrences are all
        reported. An empty `sl` matches nothing and yields [].
    """
    if not sl:
        # Without this guard `sl[0]` below would raise IndexError.
        return []
    sll = len(sl)
    return [
        (ind, ind + sll - 1)
        for ind, e in enumerate(l)
        # Cheap first-element check before comparing the whole slice.
        if e == sl[0] and l[ind:ind + sll] == sl
    ]

def get_router_prob_n2(row):
    """Turn a row's router logits into per-token probabilities over the experts.

    Reads `row["router_logits"]`, laid out as (layer, token, expert), applies a
    softmax across the expert axis and returns a NumPy array of the same shape.
    """
    logits = torch.tensor(row["router_logits"])  # (layer, token, expert)
    probs = logits.softmax(dim=-1)  # normalise over the expert axis
    return probs.cpu().numpy()  # (layer, token, expert)

# Convert the stored router logits of every example into probabilities.
for key in tqdm(dfs.keys()):
    dfs[key]["router_prob_n2"] = dfs[key].apply(get_router_prob_n2, axis=1)  # (layer, token, expert)

# Collect the routing probabilities of the target tokens across all examples
key_df_1 = "messages_0"
key_df_2 = "messages_1"
print(f"key_df_1: {key_df_1}, key_df_2: {key_df_2}")
freq = {key_df_1: [], key_df_2: []}    # per target token: (layer, expert) probabilities
tokens = {key_df_1: [], key_df_2: []}  # per target token: the token string

debug_example_starts = []  # offset into freq[key_df_1] at which each example starts
num_used_examples = 0
for row_idx in tqdm(range(0, len(dfs[key_df_1]))):
    row_1 = dfs[key_df_1].iloc[row_idx]
    row_2 = dfs[key_df_2].iloc[row_idx]
    router_prob_n2_1 = row_1["router_prob_n2"]  # (layer, token, expert)
    router_prob_n2_2 = row_2["router_prob_n2"]  # (layer, token, expert)

    # Locate the target span inside each prompt; the last occurrence is used.
    subset_1 = row_1[f"{key_df_1}_target"]["token_ids"]
    subset_2 = row_2[f"{key_df_2}_target"]["token_ids"]
    range_1 = find_sub_list(subset_1, row_1["prompt_token_ids"])
    range_2 = find_sub_list(subset_2, row_2["prompt_token_ids"])
    assert len(range_1) >= 1 and len(range_2) >= 1, f"Expected at least one range for each dataset, got {len(range_1)} and {len(range_2)}"
    range_1 = range_1[-1]
    range_2 = range_2[-1]
    num_used_examples += 1
    debug_example_starts.append(len(freq[key_df_1]))

    # Ranges are inclusive on both ends, hence the "+ 1".
    for token_1_idx in range(range_1[0], range_1[1] + 1):
        freq[key_df_1].append(router_prob_n2_1[:, token_1_idx, :])
        tokens[key_df_1].append(row_1['prompt_tokens'][token_1_idx])

    for token_2_idx in range(range_2[0], range_2[1] + 1):
        freq[key_df_2].append(router_prob_n2_2[:, token_2_idx, :])
        tokens[key_df_2].append(row_2['prompt_tokens'][token_2_idx])

    # Safety cap on the amount of collected data.
    if len(freq[key_df_1]) > 2000000:
        print("Reached 2M token comparisons, stopping...")
        break

print(len(freq[key_df_1]))
freq[key_df_1] = np.stack(freq[key_df_1])  # (token, layer, expert)
freq[key_df_2] = np.stack(freq[key_df_2])  # (token, layer, expert)
print(freq[key_df_1].shape, freq[key_df_2].shape)
print(f"Used examples: {num_used_examples}")

if "eq" in TOKEN_REDUCE_FN:
    # Equalize the number of tokens in both datasets
    min_tokens = min(len(freq[key_df_1]), len(freq[key_df_2]))
    freq[key_df_1] = freq[key_df_1][:min_tokens]
    freq[key_df_2] = freq[key_df_2][:min_tokens]
    tokens[key_df_1] = tokens[key_df_1][:min_tokens]
    tokens[key_df_2] = tokens[key_df_2][:min_tokens]
    print(freq[key_df_1].shape, freq[key_df_2].shape)

  0%|          | 0/2 [00:00<?, ?it/s]

key_df_1: messages_0, key_df_2: messages_1


  0%|          | 0/1 [00:00<?, ?it/s]

29
(29, 48, 128) (19, 48, 128)
Used examples: 1


In [8]:
from scipy.stats import ttest_rel

# Number of experts the router activates per token, taken from the metadata
# saved with the routings; int() normalises it because it may be stored as a
# string.
_routing_attrs = dfs[key_df_1].attrs
NUM_EXPERTS_PER_TOK = int(_routing_attrs["num_experts_per_tok"])
print(f"Number of experts per token: {NUM_EXPERTS_PER_TOK}")

def _topk_activation_counts(prob, k):
    """Count, per (layer, expert), how many tokens have the expert among the layer's top-`k`.

    Returns `(activated, deactivated)`, two float arrays of shape (layer, expert).
    """
    num_tokens, num_layers, num_experts = prob.shape
    k = min(k, num_experts)
    # Ascending argsort: the last k entries are the k most probable experts.
    top_experts = np.argsort(prob, axis=-1)[..., num_experts - k:]  # (token, layer, k)
    # Flatten (layer, expert) into one index so a single bincount can tally
    # all layers at once.
    layer_offsets = np.arange(num_layers)[None, :, None] * num_experts
    flat_index = (layer_offsets + top_experts).ravel()
    activated = np.bincount(flat_index, minlength=num_layers * num_experts)
    activated = activated.reshape(num_layers, num_experts).astype(np.float64)
    # For every token an expert is either in the top-k or it is not.
    return activated, num_tokens - activated


def calc_risk_diff(prob1, prob2, num_experts_per_tok=None):
    """Compute the per-expert risk difference between two sets of token routings.

    For every (layer, expert) the activation rate is the fraction of tokens for
    which the expert is among the layer's top-k experts; the risk difference is
    the rate in `prob1` minus the rate in `prob2`.

    Args:
        prob1: Router probabilities of the first set, shape (token, layer, expert).
        prob2: Router probabilities of the second set, shape (token, layer, expert).
            The token counts may differ; layer and expert counts must match.
        num_experts_per_tok: Number of top experts counted as activated. When
            None, the module-level NUM_EXPERTS_PER_TOK is used.

    Returns:
        A DataFrame with one row per (layer, expert), ordered by layer and then
        expert, with the columns
        layer, expert, a1, a2, d1, d2, a1_n, a2_n, risk_diff
        where a* / d* count activated / not-activated tokens and a*_n are the
        activation rates (NaN for a set that contains no tokens).

    Raises:
        ValueError: if k is smaller than 1 or the input shapes are incompatible.
    """
    k = NUM_EXPERTS_PER_TOK if num_experts_per_tok is None else int(num_experts_per_tok)
    if k < 1:
        # With k == 0 a slice such as [-0:] would select every expert.
        raise ValueError(f"num_experts_per_tok must be >= 1, got {k}")

    prob1 = np.asarray(prob1)
    prob2 = np.asarray(prob2)
    if prob1.ndim != 3 or prob2.ndim != 3 or prob1.shape[1:] != prob2.shape[1:]:
        raise ValueError(
            "prob1 and prob2 must both be (token, layer, expert) arrays with matching "
            f"layer and expert counts, got {prob1.shape} and {prob2.shape}"
        )
    num_layers, num_experts = prob1.shape[1:]

    a1, d1 = _topk_activation_counts(prob1, k)
    a2, d2 = _topk_activation_counts(prob2, k)

    # Activation rates; an empty token set has no defined rate.
    n1, n2 = prob1.shape[0], prob2.shape[0]
    a1_n = a1 / n1 if n1 else np.full_like(a1, np.nan)
    a2_n = a2 / n2 if n2 else np.full_like(a2, np.nan)

    return pd.DataFrame({
        "layer": np.repeat(np.arange(num_layers), num_experts),
        "expert": np.tile(np.arange(num_experts), num_layers),
        "a1": a1.ravel(),
        "a2": a2.ravel(),
        "d1": d1.ravel(),
        "d2": d2.ravel(),
        "a1_n": a1_n.ravel(),
        "a2_n": a2_n.ravel(),
        "risk_diff": (a1_n - a2_n).ravel(),
    })


# Rank every (layer, expert) by how differently it is activated on the two
# behaviors and persist the ranking.
# NOTE: this rebinds the notebook-level name `df` to the risk-difference table.
subset1, subset2 = "messages_0", "messages_1"  # "random_doc"
df = calc_risk_diff(freq[subset1], freq[subset2])
# Compact "L<layer>\nE<expert>" label per row.
df["Layer_Expert"] = [
    f"L{int(layer):02d}\nE{int(expert):02d}"
    for layer, expert in zip(df["layer"], df["expert"])
]

# Largest absolute risk difference first
df["risk_diff_abs"] = df["risk_diff"].abs()
df = df.sort_values(by="risk_diff_abs", ascending=False).reset_index(drop=True)

# File name: model, dataset, reduction tag, number of examples, number of
# compared tokens in the first subset. `subset1` is used throughout so the
# name always describes the data that was actually compared.
subset1_attrs = dfs[subset1].attrs
path = f"activations_[{subset1_attrs['model_name'].replace('/', '--')}]_[{subset1_attrs['dataset_name']}]_[{TOKEN_REDUCE_FN}]_[{len(dfs[subset1])}]_[{len(freq[subset1])}].pkl"
df.to_pickle(path)
print(f"Saved to {path}")

df

Number of experts per token: 8


  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Saved to activations_[Qwen--Qwen3-30B-A3B]_[custom_dataset]_[rd]_[1]_[29].pkl


Unnamed: 0,layer,expert,a1,a2,d1,d2,a1_n,a2_n,risk_diff,Layer_Expert,risk_diff_abs
0,45,20,2.0,18.0,27.0,1.0,0.068966,0.947368,-0.878403,L45\nE20,0.878403
1,45,41,24.0,0.0,5.0,19.0,0.827586,0.000000,0.827586,L45\nE41,0.827586
2,5,22,7.0,19.0,22.0,0.0,0.241379,1.000000,-0.758621,L05\nE22,0.758621
3,42,30,7.0,18.0,22.0,1.0,0.241379,0.947368,-0.705989,L42\nE30,0.705989
4,40,115,7.0,18.0,22.0,1.0,0.241379,0.947368,-0.705989,L40\nE115,0.705989
...,...,...,...,...,...,...,...,...,...,...,...
6139,18,107,0.0,0.0,29.0,19.0,0.000000,0.000000,0.000000,L18\nE107,0.000000
6140,18,106,0.0,0.0,29.0,19.0,0.000000,0.000000,0.000000,L18\nE106,0.000000
6141,18,105,0.0,0.0,29.0,19.0,0.000000,0.000000,0.000000,L18\nE105,0.000000
6142,18,104,0.0,0.0,29.0,19.0,0.000000,0.000000,0.000000,L18\nE104,0.000000


# 5. Steer MoE LLM using Detected Experts

In [1]:
### RESTART HERE TO FREE GPU MEMORY FOR THE NEXT STEPS

In [2]:
import os
# Process-wide configuration for the fresh kernel used for steering, assigned
# before torch / vllm / huggingface_hub are imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only GPU 0 to this process
os.environ["HF_HOME"] = "/mnt/localssd/.hfcache/"  # Hugging Face cache directory (machine-specific path; adjust for your setup)
os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"  # NOTE(review): presumably lets vLLM pass Python objects to the engine process -- confirm
os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"  # NOTE(review): presumably avoids reusing compiled artifacts from unpatched models -- confirm
os.environ["TORCHDYNAMO_VERBOSE"] = "1"  # more detailed TorchDynamo error output
os.environ["TRUST_REMOTE_CODE"] = "true"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

import sys
import torch
import argparse
import numpy as np
import pandas as pd
from scipy import stats
from tqdm.auto import tqdm
from importlib import reload
from dotenv import load_dotenv
import huggingface_hub as hf_hub
from vllm import LLM, SamplingParams

from src.utils import register_vllm_models, steer_moe

INFO 09-02 18:13:59 [__init__.py:241] Automatically detected platform cuda.


In [3]:
MODEL = "Qwen/Qwen3-30B-A3B"
DATASET = "custom_dataset"

# Ranking file written by the expert-detection step. The trailing "[1]_[29]"
# must match the saved file name: the number of example pairs and the number of
# compared tokens in the first subset.
_activations_path = f"activations_[{MODEL.replace('/', '--')}]_[{DATASET}]_[rd]_[1]_[29].pkl"

config = {
    "model": MODEL,
    "task": DATASET,
    "max_tokens": 64,

    "activations_path": _activations_path,
    # Adjust the two expert counts below to the model and task.
    "num_pos_experts": 0,
    "num_neg_experts": 250,
}
config

{'model': 'Qwen/Qwen3-30B-A3B',
 'task': 'custom_dataset',
 'max_tokens': 64,
 'activations_path': 'activations_[Qwen--Qwen3-30B-A3B]_[custom_dataset]_[rd]_[1]_[29].pkl',
 'num_pos_experts': 0,
 'num_neg_experts': 250}

In [4]:
# NOTE(review): presumably registers the steerable model variants whose expert
# routing steer_moe() later adjusts -- confirm against src.utils.
register_vllm_models()

llm = LLM(
    model=MODEL,
    trust_remote_code=True,
    tensor_parallel_size=torch.cuda.device_count(),
    gpu_memory_utilization=0.95,
    max_model_len=4096,
    max_seq_len_to_capture=4096,
    max_num_seqs=1,
    enforce_eager=True,
    enable_prefix_caching=False,
)

INFO 09-02 18:14:00 [utils.py:326] non-default args: {'model': 'Qwen/Qwen3-30B-A3B', 'trust_remote_code': True, 'max_model_len': 4096, 'enable_prefix_caching': False, 'gpu_memory_utilization': 0.95, 'max_num_seqs': 1, 'disable_log_stats': True, 'enforce_eager': True, 'max_seq_len_to_capture': 4096}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 09-02 18:14:08 [__init__.py:711] Resolved architecture: Qwen3MoeForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 09-02 18:14:08 [__init__.py:1750] Using max model len 4096
INFO 09-02 18:14:10 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 09-02 18:14:10 [__init__.py:3565] Cudagraph is disabled under eager mode
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:12 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:12 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='Qwen/Qwen3-30B-A3B', speculative_config=None, tokenizer='Qwen/Qwen3-30B-A3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disab

[1;36m(EngineCore_0 pid=3338166)[0;0m update_moe_manual_args: UPDATED EXPERTS ROUTING WEIGHTS 48


[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:16 [weight_utils.py:296] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/16 [00:00<?, ?it/s]


[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:32 [default_loader.py:262] Loading weights took 15.27 seconds
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:33 [gpu_model_runner.py:2007] Model loading took 56.8820 GiB and 16.936916 seconds
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:38 [gpu_worker.py:276] Available KV cache memory: 17.83 GiB
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:39 [kv_cache_utils.py:849] GPU KV cache size: 194,784 tokens
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:39 [kv_cache_utils.py:853] Maximum concurrency for 4,096 tokens per request: 47.55x
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:39 [core.py:214] init engine (profile, create kv cache, warmup model) took 6.64 seconds
[1;36m(EngineCore_0 pid=3338166)[0;0m INFO 09-02 18:14:46 [__init__.py:3565] Cudagraph is disabled under eager mode
INFO 09-02 18:14:46 [llm.py:298] Supported_tasks: ('generate',)


[1;36m(EngineCore_0 pid=3338166)[0;0m update_moe_manual_args: UPDATED EXPERTS ROUTING WEIGHTS 48




[1;36m(EngineCore_0 pid=3338166)[0;0m update_moe_manual_args: UPDATED EXPERTS ROUTING WEIGHTS 48
[1;36m(EngineCore_0 pid=3338166)[0;0m update_moe_manual_args: UPDATED EXPERTS ROUTING WEIGHTS 48


ERROR 09-02 18:18:06 [core_client.py:562] Engine core proc EngineCore_0 died unexpectedly, shutting down client.


In [5]:
# Prompts used to compare generations before and after steering:
# a list of conversations, each a list of chat turns.
_count_request = {"role": "user", "content": "Count to fifteen."}
batch_messages = [[_count_request]]

In [6]:
### Before Steering
# Baseline: steer_moe is called with zero positive and zero negative experts.
paired_ttest_df = steer_moe(
    llm,
    config["activations_path"],
    num_pos_experts=0,
    num_neg_experts=0,
    steering_magnitude=1000,
    reverse_effect=0,
    strategy="risk_diff",
)

# Greedy decoding with a fixed seed
sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=1,
    min_p=0,
    max_tokens=config["max_tokens"],
    seed=0,
)
outputs = llm.chat(
    batch_messages,
    sampling_params,
    use_tqdm=True,
    chat_template_kwargs={"enable_thinking": False, "reasoning_effort": "low"},
)

# First completion of every request
generations = [request_output.outputs[0].text for request_output in outputs]
print("### Before Steering:")
print(generations)

MAX EXPERTS: 1175 949
##### Total Experts: 6144, Layers: 48, Experts: 128
##### Num Experts: 0, Steering Magnitude: 1000, Reverse Effect: 0, pos_num_experts: 0, neg_num_experts: 0, metric=risk_diff, strategy: risk_diff

INFO 09-02 18:14:55 [chat_utils.py:470] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

### Before Steering:
["Sure! Here's the count from one to fifteen:\n\n1. One  \n2. Two  \n3. Three  \n4. Four  \n5. Five  \n6. Six  \n7. Seven  \n8. Eight  \n9. Nine  \n10. Ten  \n11. Eleven  \n12. Twelve  \n13"]


In [7]:
### After Steering Towards Digits
# Same call as the baseline, but with the expert counts from `config` and
# reverse_effect=0.
paired_ttest_df = steer_moe(
    llm,
    config["activations_path"],
    num_pos_experts=config["num_pos_experts"],
    num_neg_experts=config["num_neg_experts"],
    steering_magnitude=1000,
    reverse_effect=0,
    strategy="risk_diff",
)

# Greedy decoding with a fixed seed
sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=1,
    min_p=0,
    max_tokens=config["max_tokens"],
    seed=0,
)
outputs = llm.chat(
    batch_messages,
    sampling_params,
    use_tqdm=True,
    chat_template_kwargs={"enable_thinking": False, "reasoning_effort": "low"},
)

# First completion of every request
generations = [request_output.outputs[0].text for request_output in outputs]
print("### After Steering Towards Digits:")
print(generations)

MAX EXPERTS: 1175 949
##### Total Experts: 6144, Layers: 48, Experts: 128
##### Num Experts: 250, Steering Magnitude: 1000, Reverse Effect: 0, pos_num_experts: 0, neg_num_experts: 250, metric=risk_diff, strategy: risk_diff



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

### After Steering Towards Digits:
['1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15.']


In [8]:
### After Steering Away from Digits
# Identical to the "towards digits" call except for reverse_effect=1.
paired_ttest_df = steer_moe(
    llm,
    config["activations_path"],
    num_pos_experts=config["num_pos_experts"],
    num_neg_experts=config["num_neg_experts"],
    steering_magnitude=1000,
    reverse_effect=1,
    strategy="risk_diff",
)

# Greedy decoding with a fixed seed
sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=1,
    min_p=0,
    max_tokens=config["max_tokens"],
    seed=0,
)
outputs = llm.chat(
    batch_messages,
    sampling_params,
    use_tqdm=True,
    chat_template_kwargs={"enable_thinking": False, "reasoning_effort": "low"},
)

# First completion of every request
generations = [request_output.outputs[0].text for request_output in outputs]
print("### After Steering Away from Digits:")
print(generations)

MAX EXPERTS: 949 1175
##### Total Experts: 6144, Layers: 48, Experts: 128
##### Num Experts: 250, Steering Magnitude: 1000, Reverse Effect: 1, pos_num_experts: 0, neg_num_experts: 250, metric=risk_diff, strategy: risk_diff



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

### After Steering Away from Digits:
['One, two, three, four, five, six, seven, eight, nine, ten, eleven, twelve, thirteen, fourteen, fifteen.']
