In [1]:
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer
import torch


  from .autonotebook import tqdm as notebook_tqdm


# 1、导入数据

In [2]:
# 1、导入alpaca_zh数据集
alpaca_data = load_dataset(r"D:\datasets\alpaca_zh",split="train")
# 取前1000条数据
subset = alpaca_data.take(1000)
subset


Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1000
})

In [3]:
# 2、训练测试集划分
ds = subset.train_test_split(test_size=0.1,seed=42)
ds
del subset

In [4]:
# check
ds["train"][1]

{'instruction': '写一份关于“自我提升”主题的摘要。',
 'input': '',
 'output': '自我提升是积极努力在生活的各个方面变得更好的过程。它涉及制定目标和行动计划来实现这些目标，以及培养自律、适应力和成长型思维的态度和习惯。自我提升的好处包括更强的信心、更深入的自我理解以及能够实现自己的愿景并创造更美好的生活的能力。'}

# 数据预处理

In [5]:
model_path= r"D:\LLM\models_zoo\Qwen3-0.6B"
# 自动选择低精度以适配 GPU 显存
prefer_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
prefer_fp16 = torch.cuda.is_available() and not prefer_bf16

# tokenizer 放 CPU 即可
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [6]:
def process_func(datasets,tokenizer,max_length=512):
    # input str
    combined_input = "user:\n"+datasets["instruction"] +"\n\n"+datasets["input"]+"\n\nassistant:\n"
    # output str
    output= datasets["output"]+tokenizer.eos_token
    
    # str -> token ids
    embeddings = tokenizer(combined_input+output,padding=True,truncation=True,max_length=max_length,return_tensors="pt")
    
    # 计算用户输入的长度
    input_len = len(tokenizer(combined_input,padding=True,truncation=True,max_length=max_length,return_tensors="pt"))

    labels = embeddings["input_ids"].clone()
    labels[:,:input_len] = -100
    
    return {
        "input_ids":embeddings["input_ids"].squeeze(0),
        "attention_mask":embeddings["attention_mask"].squeeze(0),
        "labels":labels.squeeze(0)
    }

In [7]:
newds = ds.map(process_func,remove_columns=["instruction","input","output"],fn_kwargs={"tokenizer":tokenizer})
del ds
newds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

# 加载模型

In [8]:
from transformers import AutoModelForCausalLM

# 自动将模型放入 GPU；并使用更低精度节省显存
model_dtype = torch.bfloat16 if prefer_bf16 else (torch.float16 if prefer_fp16 else torch.float32)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=model_dtype,
    device_map="auto"
)
model

`torch_dtype` is deprecated! Use `dtype` instead!


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [9]:
# (check)模型总参数量
total_params =sum([param.numel() for param in model.parameters()])

print(f"模型总参数量: {total_params/1e6:.2f}M")

# 打印模型参数名字，方便后续冻结
for name,param in model.named_parameters():
    print(name)


模型总参数量: 596.05M
model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.self_attn.q_norm.weight
model.layers.0.self_attn.k_norm.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.self_attn.q_norm.weight
model.layers.1.self_attn.k_norm.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.k_proj.weight
model.layers.2.self_attn.v_pro

In [10]:
# 解冻最后一层
tranable_params = 0
freeze_params = 0
for name,param in model.named_parameters():
    if "model.layers.27" in name or  "model.norm" in name:
        param.requires_grad = True
        tranable_params += param.numel()
    else:
        param.requires_grad = False
        freeze_params += param.numel()

print(f"tranable_params:{tranable_params/1e6:.2f}M")
print(f"freeze_params:{freeze_params/1e6:.2f}M")

tranable_params:15.73M
freeze_params:580.32M


# 训练

In [11]:
from transformers import TrainingArguments


train_args = TrainingArguments(
    output_dir="../outputs/freeze_qwen3_0.6B",
    per_device_eval_batch_size=8,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    report_to=["tensorboard"],
    # GPU/AMP 设置
    fp16=prefer_fp16,
    bf16=prefer_bf16,
    dataloader_pin_memory=True,
    gradient_accumulation_steps=1,
    torch_compile=False
)

In [12]:
from transformers import DataCollatorForSeq2Seq, Trainer


trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=newds["train"],
    eval_dataset=newds["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer,padding=True)
)

In [13]:
trainer.train()



Step,Training Loss
10,3.261
20,3.3656
30,3.1003
40,2.9423
50,2.8817
60,2.8745
70,2.8132
80,2.7356
90,2.8875
100,2.7368


TrainOutput(global_step=675, training_loss=2.5646367447464553, metrics={'train_runtime': 1706.1439, 'train_samples_per_second': 1.583, 'train_steps_per_second': 0.396, 'total_flos': 929051576893440.0, 'train_loss': 2.5646367447464553, 'epoch': 3.0})