In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
class LoRALinear(nn.Module):
    """Wrap an ``nn.Linear``-like layer with a trainable low-rank (LoRA) branch.

    The wrapped layer is stored as ``base_layer`` and all of its parameters are
    frozen. Two bias-free linear maps are added next to it: ``lora_A``
    (in_dim -> rank) and ``lora_B`` (rank -> out_dim). ``forward`` returns
    ``base_layer(x) + (alpha / rank) * lora_B(lora_A(x))``.

    ``lora_B`` starts at zero, so a freshly built wrapper produces the same
    output as the wrapped layer.

    Args:
        original_layer: Layer to wrap. Its ``weight`` is read as a 2-D tensor
            laid out ``(out_dim, in_dim)``.
        rank: Inner dimension of the low-rank branch; must be positive.
        alpha: Numerator of the ``alpha / rank`` factor applied to the branch.

    Raises:
        ValueError: If ``rank`` is not positive.
    """

    def __init__(self, original_layer, rank=8, alpha=16):
        super().__init__()
        # Fail fast: rank == 0 would otherwise build empty adapters and only
        # surface later as a ZeroDivisionError inside forward().
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank!r}")

        self.base_layer = original_layer
        self.rank = rank
        self.alpha = alpha

        # Freeze the parameters of the wrapped layer; only the adapters train.
        for param in self.base_layer.parameters():
            param.requires_grad = False

        base_weight = self.base_layer.weight
        out_dim, in_dim = base_weight.shape[0], base_weight.shape[1]

        # Create the adapters with the same dtype and device as the wrapped weight
        # so the two branches can be summed in forward().
        self.lora_A = nn.Linear(in_dim, rank, bias=False).to(
            dtype=base_weight.dtype, device=base_weight.device
        )
        self.lora_B = nn.Linear(rank, out_dim, bias=False).to(
            dtype=base_weight.dtype, device=base_weight.device
        )

        # A gets a random init, B starts at zero so the initial update is zero.
        nn.init.kaiming_uniform_(self.lora_A.weight, a=np.sqrt(5))
        nn.init.zeros_(self.lora_B.weight)

    def forward(self, x):
        """Return the frozen layer's output plus the scaled low-rank update."""
        base_out = self.base_layer(x)
        lora_out = self.lora_B(self.lora_A(x))
        # The scale is computed on every call so later edits to alpha/rank apply.
        return base_out + (self.alpha / self.rank) * lora_out

In [4]:
# Hub identifier of the checkpoint used throughout this notebook.
model_name = "Qwen/Qwen3-0.6B"

# load the tokenizer and the model
# Both dtype and device placement are passed as "auto", i.e. left to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [5]:
def replace_linear_with_lora(model, target_name, rank=8, alpha=16):
    """Recursively swap matching ``nn.Linear`` children for ``LoRALinear``.

    Walks the module tree under ``model``. Any direct child that is an
    ``nn.Linear`` and whose attribute name equals ``target_name`` is replaced
    in place by ``LoRALinear(child, rank=rank, alpha=alpha)``.

    Args:
        model: Module whose children are inspected (modified in place).
        target_name: Attribute name a child must have to be replaced.
        rank: Forwarded to ``LoRALinear``.
        alpha: Forwarded to ``LoRALinear``.
    """
    for child_name, child in model.named_children():
        # Descend first into any child that itself contains submodules.
        has_submodules = any(True for _ in child.children())
        if has_submodules:
            replace_linear_with_lora(child, target_name, rank, alpha)

        is_target = isinstance(child, nn.Linear) and child_name == target_name
        if not is_target:
            continue

        # Re-bind the attribute on the parent so the wrapper takes the child's place.
        setattr(model, child_name, LoRALinear(child, rank=rank, alpha=alpha))

def apply_lora_model(model, target_modules, rank=8, alpha=16):
    """Freeze ``model`` and wrap the named linear layers with LoRA adapters.

    Every existing parameter is frozen first. Then, for each name in
    ``target_modules``, ``replace_linear_with_lora`` is run over the model.
    Finally a one-line summary of trainable vs. total parameter counts is
    printed.

    Args:
        model: Module to modify in place.
        target_modules: Attribute name(s) of the linear layers to wrap. A
            single string is treated as a one-element list.
        rank: Forwarded to ``replace_linear_with_lora``.
        alpha: Forwarded to ``replace_linear_with_lora``.
    """
    # A bare string would otherwise be iterated character by character, which
    # matches no layer and silently leaves the model fully frozen.
    if isinstance(target_modules, str):
        target_modules = [target_modules]

    for param in model.parameters():
        param.requires_grad = False

    for target_name in target_modules:
        replace_linear_with_lora(model, target_name, rank, alpha)

    # Number of trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # Total number of parameters
    total_params = sum(p.numel() for p in model.parameters())
    # Trainable share as a percentage; a parameter-free model reports 0 instead
    # of raising ZeroDivisionError.
    trainable_percent = (trainable_params / total_params) * 100 if total_params else 0.0

    print(
        f"trainable params: {trainable_params:,} || "
        f"all params: {total_params:,} || "
        f"trainable%: {trainable_percent:.4f}"
    )

In [6]:
# Freeze the model and wrap every nn.Linear child named "q_proj" with LoRALinear.
apply_lora_model(model,["q_proj"])

trainable params: 688,128 || all params: 596,738,048 || trainable%: 0.1153


In [7]:
# Print the module tree to inspect the layers after the LoRA replacement.
print(model)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): LoRALinear(
            (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
            (lora_A): Linear(in_features=1024, out_features=8, bias=False)
            (lora_B): Linear(in_features=8, out_features=2048, bias=False)
          )
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down

In [8]:
# Spot check: an element read from the first layer's lora_A weight should report requires_grad.
print(model.model.layers[0].self_attn.q_proj.lora_A.weight[0][0].requires_grad)

True


In [9]:
from datasets import load_dataset

# Load the GSM8K dataset ("main" configuration)
dataset = load_dataset("openai/gsm8k", "main")

# Preprocessing function - adapt it to the needs of your task
def preprocess_function(examples):
    """Turn a batch of question/answer columns into prompt/target columns.

    Args:
        examples: Batch mapping with a "question" column (iterable of strings)
            and an "answer" column.

    Returns:
        dict with "input" (one "Question: ...\\nAnswer:" prompt per question)
        and "output" (the "answer" column, passed through unchanged).
    """
    prompts = []
    for question in examples["question"]:
        # Math question-answering format: the answer is expected after "Answer:".
        prompts.append(f"Question: {question}\nAnswer:")
    return {"input": prompts, "output": examples["answer"]}

# Preprocess the dataset (adds the "input" and "output" columns to every split)
dataset = dataset.map(preprocess_function, batched=True)

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [10]:
# Show the first training example, including the columns added by preprocessing.
print(dataset['train'][0])

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72', 'input': 'Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nAnswer:', 'output': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}


In [11]:
from transformers import TrainingArguments, Trainer

# Tokenization function
def tokenize_function(examples, max_length=256):
    """Tokenize prompt+answer pairs and build loss labels for the answer only.

    Each example becomes ``"<input> <output><eos>"``, padded/truncated to
    ``max_length``. ``labels`` copies ``input_ids`` for the answer tokens and is
    ``-100`` everywhere else, so the model does not compute loss on those
    positions:

    * prompt tokens (the tokenized length of ``input`` alone), and
    * padding positions (``attention_mask == 0``).

    Previously the padding token ids were copied into ``labels``, so the loss
    also covered padding. Those padding tokens were the only end-of-text signal
    the model saw, so the tokenizer's EOS string (if it defines one) is now
    appended to keep one real end-of-sequence target.

    Args:
        examples: Batch mapping with "input" and "output" lists of strings.
        max_length: Padded/truncated sequence length (default 256, as before).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    eos_text = tokenizer.eos_token or ""
    # Concatenate prompt and answer
    texts = [
        inp + " " + out + eos_text
        for inp, out in zip(examples["input"], examples["output"])
    ]
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)
    # Prompt-only tokenization gives the number of leading tokens to mask.
    prompt_ids = tokenizer(examples["input"], truncation=True, max_length=max_length)["input_ids"]

    labels = []
    for ids, mask, prompt in zip(tokenized["input_ids"], tokenized["attention_mask"], prompt_ids):
        prompt_len = len(prompt)
        real_seen = 0  # counts non-padding tokens, so left or right padding both work
        row = []
        for token_id, keep in zip(ids, mask):
            if not keep:
                row.append(-100)
                continue
            row.append(token_id if real_seen >= prompt_len else -100)
            real_seen += 1
        labels.append(row)
    tokenized["labels"] = labels
    return tokenized

# Tokenize every split of the dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [12]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./qwen-mylora-gsm8k",
    eval_strategy="steps",
    eval_steps=500,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=100,
    # NOTE(review): the model was loaded with torch_dtype="auto", so its parameter
    # dtype comes from the checkpoint; confirm fp16 mixed precision is the right
    # match for it (bf16=True may fit better) - TODO confirm on the target hardware.
    fp16=True,
    # Use the string "none" to switch off logging integrations. report_to=None is
    # not an off switch on transformers releases whose default is "all".
    report_to="none",
    push_to_hub=False,
)

# Build the Trainer on the LoRA-wrapped model with the tokenized train/test splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
# Train the model
trainer.train()

# Save the model (written to the "qwen-mylora-gsm8k" directory)
model.save_pretrained("qwen-mylora-gsm8k")

Step,Training Loss,Validation Loss


In [None]:
# 1. Load the original pretrained model again
model_name = "Qwen/Qwen3-0.6B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cpu"
)

# 2. Replace every q_proj layer with the custom LoRALinear wrapper
apply_lora_model(model,["q_proj"])



trainable params: 688,128 || all params: 596,738,048 || trainable%: 0.1153


In [None]:
from safetensors.torch import load_file

# The checkpoint is a .safetensors file, so it is read with safetensors' load_file.
# Raw string: the Windows path contains backslashes that a normal string literal
# treats as (invalid) escape sequences and warns about.
# NOTE(review): absolute, machine-specific path - consider deriving it from the
# training output_dir instead.
state_dict = load_file(r"D:\AI\ML\MLSys Final\LORA_Reconstruct\qwen-mylora-gsm8k\checkpoint-5607\model.safetensors")

# strict=False tolerates a small number of mismatched keys, but it would also
# silently skip adapter weights, so the result is checked explicitly.
load_result = model.load_state_dict(state_dict, strict=False)
missing_lora_keys = [key for key in load_result.missing_keys if "lora_" in key]
if missing_lora_keys or load_result.unexpected_keys:
    raise RuntimeError(
        "Checkpoint does not match the LoRA-wrapped model: "
        f"missing LoRA keys={missing_lora_keys}, "
        f"unexpected keys={list(load_result.unexpected_keys)}"
    )
load_result

  state_dict = load_file("D:\AI\ML\MLSys Final\LORA_Reconstruct\qwen-mylora-gsm8k\checkpoint-5607\model.safetensors")


_IncompatibleKeys(missing_keys=['lm_head.weight'], unexpected_keys=[])

In [None]:
def generate_answer(question, max_new_tokens=100):
    """Generate an answer for ``question`` using the notebook's model.

    The question is wrapped in the same "Question: ...\\nAnswer:" prompt used
    for training, tokenized, and passed to ``model.generate``.

    Args:
        question: Question text inserted into the prompt.
        max_new_tokens: Generation budget forwarded to ``model.generate``
            (default 100, as before).

    Returns:
        The decoded first generated sequence with special tokens skipped.
    """
    input_text = f"Question: {question}\nAnswer:"
    # Move the encoded prompt to the model's device; leaving it on the CPU
    # fails once the model lives on an accelerator.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick test on a single question
question = "If a book costs $15 and the sales tax is 8%, what is the total cost?"
print(generate_answer(question))

Question: If a book costs $15 and the sales tax is 8%, what is the total cost?
Answer: The sales tax is 15 x 8/100 = $<<15*8/100=1.2>>1.2
The total cost is 15 + 1.2 = $<<15+1.2=16.2>>16.2
#### 16.2
