In [1]:
# !pip install torch transformers datasets peft accelerate

In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, TaskType

In [3]:
# 指定本地路径
local_model_path = './gpt2'

tokenizer = AutoTokenizer.from_pretrained(local_model_path)
# GPT-2 没有 pad_token，沿用旧写法
tokenizer.pad_token = tokenizer.eos_token

# 加载本地文件作为训练集
from datasets import concatenate_datasets

# 读出三个 split
ds = load_dataset(
    "json",
    data_files={
        "train": "dataset/qingyun.jsonl",
        "valid": "dataset/tieba.jsonl",
        "test":  "dataset/chat.jsonl"
    }
)

# 合并成一份大训练集
all_train_ds = concatenate_datasets([ds["train"]])

In [4]:
all_train_ds[10]

{'query': '说不粗来', 'response': '乱什么呢?'}

In [5]:
# 配置lora
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r = 8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=['c_attn', 'c_proj']
)

In [6]:
MAX_LEN = 256
def tokenize(batch):
    texts = [f"问：{q} \n答：{r}" for q, r in zip(batch["query"], batch["response"])]
    out = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    out["labels"] = out["input_ids"].copy()
    return out

all_train_ds = all_train_ds.map(tokenize, batched=True, 
                                remove_columns=all_train_ds.column_names)

Map:   0%|          | 0/117528 [00:00<?, ? examples/s]

In [7]:
# 加载本地模型
model = AutoModelForCausalLM.from_pretrained(local_model_path)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475




In [8]:
args = TrainingArguments(
    output_dir='./lora_gpt2',           # 保存路径
    per_device_train_batch_size=2,      # 单个核心上的batch_size
    gradient_accumulation_steps=4,      # 梯度累计步数，实际的全局B = 2*4
    num_train_epochs=1,                 # 训练周期lora建议1-3（防止过拟合）
    learning_rate=1e-3,                 # 优化器的学习率（默认AdamW：1e-4 ~ 1e-5）
    fp16=True,                          # 混合精度训练（前向反向用float16）
)

In [9]:
# 训练
trainer = Trainer(model = model, args=args, 
    train_dataset=all_train_ds, data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False))
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,2.2122
1000,1.9497
1500,1.8516
2000,1.761
2500,1.7208
3000,1.6841
3500,1.6391
4000,1.6398
4500,1.6212
5000,1.6165


TrainOutput(global_step=14691, training_loss=1.5912364554384115, metrics={'train_runtime': 8968.9404, 'train_samples_per_second': 13.104, 'train_steps_per_second': 1.638, 'total_flos': 1.5500970572120064e+16, 'train_loss': 1.5912364554384115, 'epoch': 1.0})

In [19]:
# 如果前面已经 import 过，下面这一行可省
from transformers import AutoTokenizer

# 1. 构造 prompt
prompt = "用户：请问你是谁\n助手："

# 2. 编码
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}   # 确保与模型同设备

# 3. 生成
out = model.generate(
    **inputs,
    max_new_tokens=32,
    do_sample=True,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id   # 防止警告
)

# 4. 解码打印
print(tokenizer.decode(out[0], skip_special_tokens=True))

用户：请问你是谁
助手：走长空默银和人！嘎嘎嘎�


In [3]:
import latexify
import math

@latexify.get_latex
def solve(a, b, c):
    return (-b + math.sqrt(b**2 - 4*a*c)) / (2*a)

solve  # 直接输出单元格，会显示 LaTeX 公式

'\\mathrm{solve}(a, b, c) = \\frac{-b + \\sqrt{ b^{2} - 4 a c }}{2 a}'

$$
\begin{algorithmic}[1]
\Procedure{Euclid}{$a, b$} \Comment{计算最大公约数}
\State $r \gets a \bmod b$
\While{$r \neq 0$}
\State $a \gets b$
\State $b \gets r$
\State $r \gets a \bmod b$
\EndWhile
\State \textbf{return} $b$
\EndProcedure
\end{algorithmic}
$$