# 1 数据预处理

In [2]:
import pandas as pd
import json


csv = pd.read_csv("./data/外科5-14000.csv", encoding="gbk", encoding_errors="ignore")
csv.info()

# 读取数据并且组织为标准型式
data = []
for _, row in csv.iterrows():
    data.append({
        "instruction": row["title"],
        "input": row["ask"],
        "output": row["answer"]
    })

# 保存文件为jsonl格式
with open("sft_dataset.jsonl", "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115991 entries, 0 to 115990
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   department  115991 non-null  object
 1   title       115991 non-null  object
 2   ask         115991 non-null  object
 3   answer      115991 non-null  object
dtypes: object(4)
memory usage: 3.5+ MB


# 2 数据加载

In [1]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="sft_dataset.jsonl")

# 3 模型加载

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
model.to(device)
# model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
def preprocess_function(examples):
    inputs = [f"Instruction: {i}\nInput: {j}" for i, j in zip(examples["instruction"], examples["input"])]
    # 将 output 作为标签
    labels = examples["output"]
    # 对输入和标签进行分词
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(labels, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# 4 模型训练

In [7]:
from transformers import TrainingArguments, Trainer

# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    # fp16=True,  # 使用混合精度训练
)

# 定义 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    processing_class=tokenizer,
)

# 开始微调
trainer.train()

Step,Training Loss
100,1.6275
200,1.5377
300,1.5057
400,1.4887
500,1.5373
600,1.5149
700,1.4667
800,1.4131
900,1.3988
1000,1.4465


KeyboardInterrupt: 

In [6]:
torch.cuda.empty_cache()