In [5]:
from transformers import AutoConfig, AutoModel
from accelerate import init_empty_weights

# Path of the checkpoint directory whose architecture is inspected below.
model_name = "models/loki_qwen2.5_0.5b_10"
config = AutoConfig.from_pretrained(model_name)

# Instantiate the architecture inside accelerate's empty-weights context so
# that only the module structure and parameter shapes are needed here.
with init_empty_weights():
    model = AutoModel.from_config(config, trust_remote_code=True)

print(model)

# Walk the parameters once and derive every statistic from that snapshot.
named_params = list(model.named_parameters())
trainable_named = [(n, p) for n, p in named_params if p.requires_grad]

total_params = sum(p.numel() for _, p in named_params)
trainable_params = sum(p.numel() for _, p in trainable_named)

print(f"\n总参数量: {total_params:,}")
print(f"可训练参数量: {trainable_params:,}")
print(f"可训练参数比例: {trainable_params/total_params*100:.2f}%")

# List every parameter tensor that still receives gradients, with its size.
print("\n可训练模块:")
for name, param in trainable_named:
    print(f"  {name}: {param.numel():,} 参数")


LoKIQwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): LoKILinear(
            in_features=4864, out_features=896, active_neurons=89 (9.9%)
            (active): Linear(in_features=4864, out_features=89, bias=False)
            (frozen): Linear(in_features=4864, out_features=807, bias=False)
          )
          (act_fn): SiLUActivation()
        )
        (