In [20]:
# 加载模型并测试
from transformers import AutoTokenizer, AutoModelForCausalLM

# 指定模型路径，这里是一个本地已经下载好的 DeepSeek-R1 模型的路径
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

print("模型加载成功！")

Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]

模型加载成功！


In [21]:
import json

# 假设这是你的 50 条样本数据
#samples = [...]  # 每个 sample 应为 dict 类型, 例如 {"text": "xxx"} 或 {"input": "...", "output": "..."}
samples = []

with open("distill_psychology-10k-r1.json", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        samples.append({
            "prompt": item["input"],
            "completion": item["content"]
        })

print(f"Loaded {len(samples)} samples")
print(f"First sample prompt: {samples[0]['prompt'][:50]}...")

# 写入 jsonl 文件
with open("dataset.jsonl", "w", encoding="utf-8") as f:
  for sample in samples:
     f.write(json.dumps(sample, ensure_ascii=False) + "\n")

print("数据集制作完成！")

Loaded 8775 samples
First sample prompt: 我晚上难以入睡，我认为这是因为我对工作感到压力...
数据集制作完成！


In [22]:
# 拆分数据集
from datasets import load_dataset

# 加载本地数据
dataset = load_dataset("json", data_files={"train": "dataset.jsonl"}, split="train")

print("数据总数量: ", len(dataset))

# 划分训练集和测试集 (90% 训练, 10% 测试)
train_test_split = dataset.train_test_split(test_size=0.1)

# 提取训练集和验证集
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"train dataset len: {len(train_dataset)}")
print(f"test dataset len : {len(eval_dataset)}")
print("训练数据的准备工作完成")


Generating train split: 0 examples [00:00, ? examples/s]

数据总数量:  8775
train dataset len: 7897
test dataset len : 878
训练数据的准备工作完成


In [23]:
def tokenizer_function(many_samples):
    """
    将 prompt 和 completion 拼接后进行分词处理
    """
    # 将每条样本的 prompt 和 completion 拼接成一个文本
    texts = [f"{prompt}\n{completion}" for prompt, completion in zip(many_samples["prompt"], many_samples["completion"])]

    # 使用 tokenizer 进行分词, 截断长度为 512, 填充至最大长度
    tokens = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length"
    )

    # 设置 labels 为 input_ids 的副本 (用于因果语言建模任务)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_train_dataset = train_dataset.map(tokenizer_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenizer_function, batched=True)

Map:   0%|          | 0/7897 [00:00<?, ? examples/s]

Map:   0%|          | 0/878 [00:00<?, ? examples/s]

In [24]:
print(tokenized_eval_dataset)

Dataset({
    features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 878
})


In [25]:
!pip install -U bitsandbytes>=0.46.1

In [26]:
# 量化设置
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 配置 8bit 量化
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# 重新加载量化后的模型
model = AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=quantization_config,
      device_map="auto"
  )

Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]

In [27]:
# 配置 LoRA 参数
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
      r=8,             # LoRA 秩 (rank), 控制适配器大小, 通常设为 8~32
      lora_alpha=16,         # 控制 LoRA 更新的缩放因子, 一般为 r 的倍数
      lora_dropout=0.05,        # Dropout 概率, 防止过拟合
      task_type=TaskType.CAUSAL_LM   # 任务类型: 因果语言模型
  )

# 获取 PEFT 模型
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 1,778,177,536 || trainable%: 0.0613


In [28]:
# 开始训练
from transformers import TrainingArguments, Trainer

# 配置训练参数
training_args = TrainingArguments(
      output_dir="./finetuned_models",       # 模型保存路径
      num_train_epochs=1,                   # 训练轮数
      per_device_train_batch_size=4,         # 每设备批量大小 (GPU 上)
      gradient_accumulation_steps=8,         # 梯度累积步数 (模拟更大 batch)
      fp16=True,                             # 使用 FP16 半精度训练, 节省显存
      logging_steps=10,                      # 每 10 步打印一次日志
      save_steps=100,                        # 每 100 步保存一次 checkpoint
      eval_strategy="steps",                 # 每隔一定步数评估一次
      eval_steps=10,                         # 每 10 步进行一次评估
      learning_rate=3e-5,                    # 学习率
      logging_dir="./logs",                  # 日志保存路径
      run_name="deepseek"
)
print("训练参数设置完毕")

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


训练参数设置完毕


In [None]:
# 定义训练器
trainer = Trainer(
      model=model,                              # 已加载并配置好 LoRA 的模型
      args=training_args,                       # 训练参数
      train_dataset=tokenized_train_dataset,    # 分词后的训练数据集
      eval_dataset=tokenized_eval_dataset       # 分词后的验证数据集
)

print("------开始训练------")
trainer.train()
print("------训练完成------")

------开始训练------




Step,Training Loss,Validation Loss
10,3.58821,3.592671
20,3.585817,3.566569
30,3.632222,3.534918


In [None]:
# 保存 LoRA 适配器
model.save_pretrained("./finetuned_models/lora_adapter")
tokenizer.save_pretrained("./finetuned_models/lora_adapter")
print("LoRA 适配器保存完成")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# 重新加载基础模型 (全精度, 用于合并)
base_model = AutoModelForCausalLM.from_pretrained(
      "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
      device_map="auto"
)

# 加载 LoRA 适配器并合并
# merged_model = PeftModel.from_pretrained(base_model, "./finetuned_models/lora_adapter")
merged_model = PeftModel.from_pretrained(base_model, "./finetuned_models/checkpoint-247")
merged_model = merged_model.merge_and_unload()

# 保存合并后的完整模型
merged_model.save_pretrained("./finetuned_models/merged_model")
tokenizer.save_pretrained("./finetuned_models/merged_model")
print("模型合并完成")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# 加载合并后的模型
model = AutoModelForCausalLM.from_pretrained("./finetuned_models/merged_model").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("./finetuned_models/merged_model")

# 测试
prompt = "我最近感到非常愤怒，但不知道原因是什么。"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
  **inputs,
  max_new_tokens=512,
  temperature=0.7,
  top_p=0.9,
  do_sample=True
)

response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(f"问: {prompt}")
print(f"答: {response}")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# 加载合并后的模型
model = AutoModelForCausalLM.from_pretrained("./finetuned_models/merged_model").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("./finetuned_models/merged_model")
print("模型加载成功！输入 'quit' 退出\n")

while True:
    prompt = input("问: ")
    if prompt.strip().lower() == "quit":
        print("再见！")
        break

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print(f"答: {response}\n")