In [None]:
from transformers import AutoModel

# Load the pretrained checkpoint, using /cache/huggingface/hub as the cache directory.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
model = AutoModel.from_pretrained(model_name, cache_dir="/cache/huggingface/hub")

# Add up the element count of every parameter tensor in the model.
total_params = sum(map(lambda tensor: tensor.numel(), model.parameters()))

print(f"模型的总参数量: {total_params}")

# 比较模型参数差异并提取位置

In [1]:
from typing import Union, List
from pprint import pprint
import torch

from transformers import AutoModelForCausalLM


def compare_mlp_params(model1, model2, mlp_layers: Union[str, List[str]]) -> dict:
    """Compare selected parameters of two models and report which rows differ.

    A parameter is selected when its name, as yielded by
    ``named_parameters()``, contains at least one of the given patterns as a
    substring.

    Args:
        model1: First model; must expose ``named_parameters()``.
        model2: Second model; must yield the same selected parameter names.
        mlp_layers: Either a single pattern, e.g. ``"transformer.h.0.mlp"``,
            or an iterable of patterns, e.g.
            ``["transformer.h.0.mlp", "transformer.h.5.mlp"]``.

    Returns:
        A dict keyed by parameter name, containing one entry for every
        selected parameter that is not bit-identical in both models:

        - ``"shape"``: the parameter shape as a tuple.
        - ``"diff_indices"``: sorted indices along dim 0 that hold at least
          one element outside the ``rtol=1e-5, atol=1e-8`` tolerance.
        - ``"diff_count"``: ``len(diff_indices)``.
        - ``"diff_ratio"``: ``diff_count`` divided by the size of dim 0, i.e.
          the fraction of rows that changed.

    Raises:
        ValueError: If the two models do not select the same parameter names,
            or if a selected parameter has different shapes in the two models.
    """
    # Normalise to a concrete list: the patterns are scanned once per
    # parameter name and per model, so a one-shot iterable would be exhausted
    # after the first check.
    if isinstance(mlp_layers, str):
        target_patterns = [mlp_layers]
    else:
        target_patterns = list(mlp_layers)

    def filter_params(model):
        """Return {name: parameter} for names matching any target pattern."""
        return {
            name: param
            for name, param in model.named_parameters()
            if any(pattern in name for pattern in target_patterns)
        }

    params1 = filter_params(model1)
    params2 = filter_params(model2)

    # Both models must expose exactly the same set of selected parameters.
    if params1.keys() != params2.keys():
        missing_in_1 = set(params2.keys()) - set(params1.keys())
        missing_in_2 = set(params1.keys()) - set(params2.keys())
        raise ValueError(
            f"模型结构不一致\n"
            f"Model1缺失层: {list(missing_in_1)}\n"
            f"Model2缺失层: {list(missing_in_2)}"
        )

    differences = {}

    for name, param1 in params1.items():
        # Compare on CPU and outside autograd so the models may live on
        # different devices.
        p1 = param1.detach().cpu()
        p2 = params2[name].detach().cpu()

        if p1.shape != p2.shape:
            raise ValueError(f"形状不匹配: {name} | {p1.shape} vs {p2.shape}")

        if torch.equal(p1, p2):
            continue

        diff_mask = ~torch.isclose(p1, p2, rtol=1e-5, atol=1e-8)
        if diff_mask.dim() == 0:
            # Treat a scalar parameter as a single row so it can be indexed.
            diff_mask = diff_mask.reshape(1)

        # Collapse every trailing dimension: a row counts as different when
        # any of its elements is outside tolerance.
        num_rows = diff_mask.shape[0]
        row_mask = diff_mask.reshape(num_rows, -1).any(dim=1)
        diff_indices = torch.nonzero(row_mask).flatten()
        diff_count = diff_indices.numel()

        differences[name] = {
            "shape": tuple(p1.shape),
            "diff_count": diff_count,
            # diff_count counts rows, so the ratio is taken against the number
            # of rows rather than the number of elements.
            "diff_ratio": diff_count / num_rows,
            "diff_indices": diff_indices.tolist(),
        }

    return differences


# Usage example: compare the down_proj weight of every decoder layer between
# the base model and the restored fine-tuned model.
if __name__ == "__main__":
    # A single layer can be compared by passing one pattern string instead of
    # a list, e.g.
    # compare_mlp_params(model1, model2, "model.layers.0.mlp.down_proj.weight").
    # Note that the function takes loaded models, not paths.
    model1_path = "Qwen/Qwen2.5-0.5B-Instruct"
    model2_path = "/cache/models/loki_reranker_qwen2_5-0-5b-40_real"
    model1 = AutoModelForCausalLM.from_pretrained(
        model1_path, torch_dtype=torch.bfloat16
    )
    model2 = AutoModelForCausalLM.from_pretrained(
        model2_path, torch_dtype=torch.bfloat16
    )
    # To discover valid parameter names, iterate model1.named_parameters().

    # One exact parameter name per decoder layer of model1.
    target_modules = [
        f"model.layers.{idx}.mlp.down_proj.weight"
        for idx in range(len(model1.model.layers))
    ]

    # Multi-layer comparison.
    diff = compare_mlp_params(model1, model2, mlp_layers=target_modules)
    pprint(diff)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


{'model.layers.0.mlp.down_proj.weight': {'diff_count': 55,
                                         'diff_indices': [16,
                                                          25,
                                                          32,
                                                          41,
                                                          54,
                                                          79,
                                                          91,
                                                          95,
                                                          146,
                                                          205,
                                                          207,
                                                          212,
                                                          216,
                                                          218,
                                                          231,
    

# 打印模型结构

In [None]:
from transformers import AutoModel

# Load the model saved under the local "_real" directory.
model_name = "/cache/models/loki_reranker_qwen2_5-0-5b-5_real"
model = AutoModel.from_pretrained(model_name)

# Print the module tree so the layer structure can be inspected.
print(model)

Qwen2Model(
  (embed_tokens): Embedding(151936, 896)
  (layers): ModuleList(
    (0-23): 24 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=896, out_features=896, bias=True)
        (k_proj): Linear(in_features=896, out_features=128, bias=True)
        (v_proj): Linear(in_features=896, out_features=128, bias=True)
        (o_proj): Linear(in_features=896, out_features=896, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
        (up_proj): Linear(in_features=896, out_features=4864, bias=False)
        (down_proj): Linear(in_features=4864, out_features=896, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((896,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)


In [None]:
from safetensors import safe_open
# Open the training checkpoint's safetensors file and list what it contains.
with safe_open("/cache/models/loki_reranker_qwen2_5-0-5b-5/checkpoint-456688/model.safetensors", framework="pt") as f:
    print(f.keys())  # print every tensor key name stored in the file

# 还原模型

In [None]:
from safetensors import safe_open
from transformers import AutoModelForCausalLM
from src.module.loki_linear import LoKILinear
import json
import torch

# Paths for the restore run: the safetensors file the LoKI tensors are read
# from, the JSON file listing the target neurons, and the model directory.
checkpoint_path = (
    "/cache/models/loki_reranker_qwen2_5-0-5b-5/checkpoint-456688/model.safetensors"
)
target_neurons_path = "target_neurons/Qwen2.5-0.5B-Instruct/5.json"
target_model = "/cache/models/loki_reranker_qwen2_5-0-5b-5/checkpoint-456688"

with open(target_neurons_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# Indexed by layer index further down in this cell.
# NOTE(review): presumably the JSON is a list with one entry per layer; if it
# were a dict, list(data) would yield only its keys -- confirm.
trainable_neurons = list(data)
# Re-create the plain (non-LoKI) model structure from the checkpoint directory.
# The recorded output reports that the checkpoint's
# down_proj.active_part / fixed_part tensors are not used by this load.
original_model = AutoModelForCausalLM.from_pretrained(target_model)


def merge_loki_weights(loki_layer, original_linear):
    """Write a LoKI layer's split parameters back into a plain linear layer.

    The rows held by ``loki_layer.active_part`` and ``loki_layer.fixed_part``
    are placed at ``loki_layer.active_pos`` and ``loki_layer.fixed_pos``
    (indices along dim 0) of a zero-initialised tensor shaped like the
    destination, which is then copied into ``original_linear`` in place.
    The bias is handled the same way, from ``active_bias`` / ``fixed_bias``,
    only when ``original_linear.bias`` is not ``None``.

    Args:
        loki_layer: Object exposing ``active_pos``, ``fixed_pos``,
            ``active_part.weight``, ``fixed_part.weight`` and, when a bias is
            merged, ``active_bias`` and ``fixed_bias``.
        original_linear: Linear layer whose ``weight`` (and ``bias``, if
            present) are overwritten.
    """

    def _reassemble(template, active_rows, fixed_rows):
        # Start from zeros shaped like the destination, then drop each group
        # of rows at its recorded positions.
        merged = torch.zeros_like(template)
        merged[loki_layer.active_pos] = active_rows
        merged[loki_layer.fixed_pos] = fixed_rows
        return merged

    has_bias = original_linear.bias is not None

    # Build both merged tensors before touching the destination layer.
    merged_weight = _reassemble(
        original_linear.weight.data,
        loki_layer.active_part.weight.data,
        loki_layer.fixed_part.weight.data,
    )
    merged_bias = None
    if has_bias:
        merged_bias = _reassemble(
            original_linear.bias.data,
            loki_layer.active_bias.data,
            loki_layer.fixed_bias.data,
        )

    # In-place copy keeps the destination's existing parameter objects.
    original_linear.weight.data.copy_(merged_weight)
    if has_bias:
        original_linear.bias.data.copy_(merged_bias)


# Restore the down_proj of every decoder layer from the LoKI checkpoint.
# The checkpoint file is opened once for the whole loop rather than once per
# layer.
with safe_open(checkpoint_path, framework="pt") as f:
    for layer_idx in range(original_model.config.num_hidden_layers):
        # Plain linear layer that will receive the merged weights.
        original_down_proj = original_model.model.layers[layer_idx].mlp.down_proj

        # Temporary LoKI layer used only as a container for the checkpoint tensors.
        loki_layer = LoKILinear(
            original_down_proj, target_neurons=trainable_neurons[layer_idx]
        )

        # Only the two weight tensors are read; no bias tensors are loaded
        # (assumes down_proj is bias-free -- confirm for other architectures).
        key_prefix = f"model.layers.{layer_idx}.mlp.down_proj"
        loki_layer.load_state_dict(
            {
                "active_part.weight": f.get_tensor(f"{key_prefix}.active_part.weight"),
                "fixed_part.weight": f.get_tensor(f"{key_prefix}.fixed_part.weight"),
            },
            strict=True,
        )

        # Merge the split weights back into the plain layer in place.
        merge_loki_weights(loki_layer, original_down_proj)

# Save the restored model.
original_model.save_pretrained("/cache/models/loki_reranker_qwen2_5-0-5b-5_real")

Some weights of the model checkpoint at /cache/models/loki_reranker_qwen2_5-0-5b-5/checkpoint-456688 were not used when initializing Qwen2ForCausalLM: {'model.layers.0.mlp.down_proj.active_part.weight', 'model.layers.14.mlp.down_proj.fixed_part.weight', 'model.layers.7.mlp.down_proj.active_part.weight', 'model.layers.21.mlp.down_proj.active_part.weight', 'model.layers.4.mlp.down_proj.fixed_part.weight', 'model.layers.5.mlp.down_proj.fixed_part.weight', 'model.layers.2.mlp.down_proj.active_part.weight', 'model.layers.6.mlp.down_proj.fixed_part.weight', 'model.layers.1.mlp.down_proj.active_part.weight', 'model.layers.8.mlp.down_proj.active_part.weight', 'model.layers.18.mlp.down_proj.active_part.weight', 'model.layers.22.mlp.down_proj.fixed_part.weight', 'model.layers.3.mlp.down_proj.fixed_part.weight', 'model.layers.15.mlp.down_proj.fixed_part.weight', 'model.layers.16.mlp.down_proj.active_part.weight', 'model.layers.23.mlp.down_proj.fixed_part.weight', 'model.layers.17.mlp.down_proj.ac

In [5]:
from safetensors import safe_open
from transformers import AutoModelForCausalLM
from src.loki.loki_linear import LoKILinear
import json
import torch

# Paths for the restore run: the safetensors file the LoKI tensors are read
# from, the JSON file listing the target neurons, and the model directory.
checkpoint_path = (
    "/cache/models/loki_reranker_qwen2_5-0-5b-40/model.safetensors"
)
target_neurons_path = "target_neurons/Qwen2.5-0.5B-Instruct/40.json"
target_model = "/cache/models/loki_reranker_qwen2_5-0-5b-40"

with open(target_neurons_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# Indexed by layer index further down in this cell.
# NOTE(review): presumably the JSON is a list with one entry per layer; if it
# were a dict, list(data) would yield only its keys -- confirm.
trainable_neurons = list(data)
# Re-create the plain (non-LoKI) model structure from the model directory.
# The recorded output reports every down_proj.weight as newly initialized by
# this load, so those weights are placeholders until the merge below runs.
original_model = AutoModelForCausalLM.from_pretrained(target_model)


def merge_loki_weights(loki_layer, original_linear):
    """Copy a LoKI layer's split weight (and bias) into a plain linear layer.

    For the weight, and for the bias when ``original_linear.bias`` is not
    ``None``, a zero tensor shaped like the destination is filled at
    ``loki_layer.active_pos`` with the active rows and at
    ``loki_layer.fixed_pos`` with the fixed rows (both index dim 0), then
    copied into the destination in place.

    Args:
        loki_layer: Object exposing ``active_pos``, ``fixed_pos``,
            ``active_weight``, ``fixed_weight`` and, when a bias is merged,
            ``active_bias`` and ``fixed_bias``.
        original_linear: Linear layer whose parameters are overwritten.
    """
    # (destination parameter, active rows, fixed rows) for each tensor to merge.
    plan = [
        (original_linear.weight, loki_layer.active_weight, loki_layer.fixed_weight)
    ]
    if original_linear.bias is not None:
        plan.append(
            (original_linear.bias, loki_layer.active_bias, loki_layer.fixed_bias)
        )

    # Stage every merged tensor first so nothing is written to the destination
    # layer unless all merges succeed.
    staged = []
    for destination, active_rows, fixed_rows in plan:
        merged = torch.zeros_like(destination.data)
        merged[loki_layer.active_pos] = active_rows.data
        merged[loki_layer.fixed_pos] = fixed_rows.data
        staged.append((destination, merged))

    # In-place copy keeps the destination's existing parameter objects.
    for destination, merged in staged:
        destination.data.copy_(merged)


# Restore the down_proj of every decoder layer from the LoKI checkpoint.
# The checkpoint file is opened once for the whole loop rather than once per
# layer.
with safe_open(checkpoint_path, framework="pt") as f:
    for layer_idx in range(original_model.config.num_hidden_layers):
        # Plain linear layer that will receive the merged weights.
        original_down_proj = original_model.model.layers[layer_idx].mlp.down_proj

        # Temporary LoKI layer used only as a container for the checkpoint tensors.
        loki_layer = LoKILinear(
            original_down_proj, target_neurons=trainable_neurons[layer_idx]
        )

        # Only the two weight tensors are read from the checkpoint; no bias
        # tensors are loaded (assumes down_proj is bias-free -- confirm).
        # index_map is fed back from the freshly built layer itself,
        # presumably so that strict=True finds every expected key.
        key_prefix = f"model.layers.{layer_idx}.mlp.down_proj"
        loki_layer.load_state_dict(
            {
                "active_weight": f.get_tensor(f"{key_prefix}.active_weight"),
                "fixed_weight": f.get_tensor(f"{key_prefix}.fixed_weight"),
                "index_map": loki_layer.index_map,
            },
            strict=True,
        )

        # Merge the split weights back into the plain layer in place.
        merge_loki_weights(loki_layer, original_down_proj)

# Save the restored model.
original_model.save_pretrained("/cache/models/loki_reranker_qwen2_5-0-5b-40_real")

Some weights of Qwen2ForCausalLM were not initialized from the model checkpoint at /cache/models/loki_reranker_qwen2_5-0-5b-40 and are newly initialized: ['model.layers.0.mlp.down_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.10.mlp.down_proj.weight', 'model.layers.11.mlp.down_proj.weight', 'model.layers.12.mlp.down_proj.weight', 'model.layers.13.mlp.down_proj.weight', 'model.layers.14.mlp.down_proj.weight', 'model.layers.15.mlp.down_proj.weight', 'model.layers.16.mlp.down_proj.weight', 'model.layers.17.mlp.down_proj.weight', 'model.layers.18.mlp.down_proj.weight', 'model.layers.19.mlp.down_proj.weight', 'model.layers.2.mlp.down_proj.weight', 'model.layers.20.mlp.down_proj.weight', 'model.layers.21.mlp.down_proj.weight', 'model.layers.22.mlp.down_proj.weight', 'model.layers.23.mlp.down_proj.weight', 'model.layers.3.mlp.down_proj.weight', 'model.layers.4.mlp.down_proj.weight', 'model.layers.5.mlp.down_proj.weight', 'model.layers.6.mlp.down_proj.weight', 'model.layer

[2025-03-26 23:49:59,751] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'

In [1]:
from src.train.qwen_loki import LoKIQwen2ForCausalLM
import json
from transformers import AutoModelForCausalLM
import torch

# Load the stock model as the reference, then the LoKI variant of the same
# checkpoint, and check that the LoKI split of layer 0's down_proj still
# reproduces the reference weight.
target_neurons_path = "target_neurons/Qwen2.5-0.5B-Instruct/5.json"
original_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct", torch_dtype=torch.bfloat16
)
with open(target_neurons_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# custom_config = LoKIQwen2Config("Qwen/Qwen2.5-0.5B-Instruct", target_neurons=data)
model = LoKIQwen2ForCausalLM.from_pretrained(
    pretrained_model_name_or_path="Qwen/Qwen2.5-0.5B-Instruct",
    target_neurons=data,
    torch_dtype=torch.bfloat16,
)
print(model)
# Verify that the first layer's weights were carried over into the LoKI layer.
loki_layer = model.model.layers[0].mlp.down_proj

pretrained_weight = original_model.model.layers[0].mlp.down_proj.weight

# Rebuild the full weight: place the active and fixed rows at their recorded
# dim-0 positions in a zero tensor shaped like the reference weight.
combined_weight = torch.zeros_like(pretrained_weight)
combined_weight[loki_layer.active_pos] = loki_layer.active_part.weight.data
combined_weight[loki_layer.fixed_pos] = loki_layer.fixed_part.weight.data
print(torch.allclose(combined_weight, pretrained_weight, atol=1e-6))  # expected to print True

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


成功替换层0
成功替换层1
成功替换层2
成功替换层3
成功替换层4
成功替换层5
成功替换层6
成功替换层7
成功替换层8
成功替换层9
成功替换层10
成功替换层11
成功替换层12
成功替换层13
成功替换层14
成功替换层15
成功替换层16
成功替换层17
成功替换层18
成功替换层19
成功替换层20
成功替换层21
成功替换层22
成功替换层23
LoKIQwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0): Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): LoKILinear(
            (active_part): Linear(in_features=4864, out_features=6, bias=False)
            (fixed_part): Linear(in_feat

In [5]:
from src.loki.qwen_loki import LoKIQwen2ForCausalLM
import json
import torch
from transformers import AutoConfig, AutoModel, Qwen2Config

# Build the LoKI variant of the base model from the target-neuron file and
# save it to a local directory.
target_neurons_path = "target_neurons/Qwen2.5-0.5B-Instruct/5.json"

with open(target_neurons_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# custom_config = LoKIQwen2Config("Qwen/Qwen2.5-0.5B-Instruct", target_neurons=data)
model = LoKIQwen2ForCausalLM.from_pretrained(
    pretrained_model_name_or_path="Qwen/Qwen2.5-0.5B-Instruct",
    target_neurons=data,
    torch_dtype=torch.bfloat16,
)
# NOTE(review): presumably registered so that the saved directory can later be
# loaded through AutoModelForCausalLM with trust_remote_code=True -- confirm.
LoKIQwen2ForCausalLM.register_for_auto_class("AutoModelForCausalLM")

# AutoModel.register(Qwen2Config, LoKIQwen2ForCausalLM)
# LoKIQwen2ForCausalLM.register_for_auto_class("AutoModel")
model.save_pretrained("/cache/models/custom-model")

成功替换层0
成功替换层1
成功替换层2
成功替换层3
成功替换层4
成功替换层5
成功替换层6
成功替换层7
成功替换层8
成功替换层9
成功替换层10
成功替换层11
成功替换层12
成功替换层13
成功替换层14
成功替换层15
成功替换层16
成功替换层17
成功替换层18
成功替换层19
成功替换层20
成功替换层21
成功替换层22
成功替换层23


# 自定义模型加载测试

In [4]:
from transformers import AutoModelForCausalLM
import json
import torch

# Try loading through AutoModelForCausalLM while passing target_neurons.
# NOTE(review): the recorded run of this cell fails with
# "TypeError: Qwen2ForCausalLM.__init__() got an unexpected keyword argument
# 'target_neurons'", i.e. the path below resolved to the stock Qwen2 class.
# NOTE(review): the neuron file is under Qwen2.5-0.5B-Instruct while the model
# path is Qwen2.5-3B-Instruct -- confirm which model was intended.
target_neurons_path = "target_neurons/Qwen2.5-0.5B-Instruct/10.json"

with open(target_neurons_path, "r", encoding="utf-8") as f:
    data = json.load(f)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="/cache/models/Qwen2.5-3B-Instruct",
    target_neurons=data,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
print(model)

TypeError: Qwen2ForCausalLM.__init__() got an unexpected keyword argument 'target_neurons'

# 自定义模型加载

In [None]:
from transformers import AutoModelForCausalLM
import torch

# Load the saved custom model directory through the Auto class.
# trust_remote_code=True permits executing model code shipped with the
# checkpoint, so only point this at directories you trust.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="/cache/models/custom-model-test",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# Print the module tree to confirm which layers were loaded.
print(model)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


成功替换层0
成功替换层1
成功替换层2
成功替换层3
成功替换层4
成功替换层5
成功替换层6
成功替换层7
成功替换层8
成功替换层9
成功替换层10
成功替换层11
成功替换层12
成功替换层13
成功替换层14
成功替换层15
成功替换层16
成功替换层17
成功替换层18
成功替换层19
成功替换层20
成功替换层21
成功替换层22
成功替换层23
Parameter: model.layers.0.mlp.down_proj.active_weight, Shape: torch.Size([13, 4864])
Parameter: model.layers.1.mlp.down_proj.active_weight, Shape: torch.Size([27, 4864])
Parameter: model.layers.2.mlp.down_proj.active_weight, Shape: torch.Size([41, 4864])
Parameter: model.layers.3.mlp.down_proj.active_weight, Shape: torch.Size([55, 4864])
Parameter: model.layers.4.mlp.down_proj.active_weight, Shape: torch.Size([68, 4864])
Parameter: model.layers.5.mlp.down_proj.active_weight, Shape: torch.Size([82, 4864])
Parameter: model.layers.6.mlp.down_proj.active_weight, Shape: torch.Size([96, 4864])
Parameter: model.layers.7.mlp.down_proj.active_weight, Shape: torch.Size([110, 4864])
Parameter: model.layers.8.mlp.down_proj.active_weight, Shape: torch.Size([124, 4864])
Parameter: model.layers.9.mlp.down_proj.active_weig

# 加载后Model还原

In [1]:
from src.loki.loki_linear import restore_original_linears
from transformers import AutoModelForCausalLM
import torch
# Load the saved custom model, print it, run restore_original_linears on it,
# and print it again so the structure before and after can be compared.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="/cache/models/custom-model-test",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
print(model)
# `model` is rebound to the returned object; the loaded model is no longer
# reachable under this name afterwards.
model = restore_original_linears(model)
print(model)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


成功替换层0
成功替换层1
成功替换层2
成功替换层3
成功替换层4
成功替换层5
成功替换层6
成功替换层7
成功替换层8
成功替换层9
成功替换层10
成功替换层11
成功替换层12
成功替换层13
成功替换层14
成功替换层15
成功替换层16
成功替换层17
成功替换层18
成功替换层19
成功替换层20
成功替换层21
成功替换层22
成功替换层23
Parameter: model.layers.0.mlp.down_proj.active_weight, Shape: torch.Size([13, 4864])
Parameter: model.layers.1.mlp.down_proj.active_weight, Shape: torch.Size([27, 4864])
Parameter: model.layers.2.mlp.down_proj.active_weight, Shape: torch.Size([41, 4864])
Parameter: model.layers.3.mlp.down_proj.active_weight, Shape: torch.Size([55, 4864])
Parameter: model.layers.4.mlp.down_proj.active_weight, Shape: torch.Size([68, 4864])
Parameter: model.layers.5.mlp.down_proj.active_weight, Shape: torch.Size([82, 4864])
Parameter: model.layers.6.mlp.down_proj.active_weight, Shape: torch.Size([96, 4864])
Parameter: model.layers.7.mlp.down_proj.active_weight, Shape: torch.Size([110, 4864])
Parameter: model.layers.8.mlp.down_proj.active_weight, Shape: torch.Size([124, 4864])
Parameter: model.layers.9.mlp.down_proj.active_weig

In [1]:
from src.loki.tools import restore_loki_model
# Restore a trained LoKI model via the project helper.
# NOTE(review): the recorded run of this cell fails with
# "TypeError: restore_loki_model() got an unexpected keyword argument
# 'output_path'"; check the helper's signature for the correct keyword.
restore_loki_model(
    model_path="/cache/models/loki_reranker_qwen2_5-0-5b-10",
    target_neurons_path="target_neurons/Qwen2.5-0.5B-Instruct/10.json",
    output_path="/cache/models/loki_reranker_qwen2_5-0-5b-10_real",
)

TypeError: restore_loki_model() got an unexpected keyword argument 'output_path'