In [1]:
"""
範例：GPT-2 + LoRA + 自定義 PPO 損失 (Self-motivated Learning)
修正：將 generate() 的 max_length 改為 max_new_tokens，以免 prompt 長度超過 max_length。
"""

import os
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model

# -----------------------------
# 1. 準備自定義 Dataset
# -----------------------------
class QADataset(Dataset):
    def __init__(self, data_list):
        self.data_list = []
        for item in data_list:
            q = str(item.get("question", ""))
            a = str(item.get("answer", ""))
            # 若要過濾空值，可自行加判斷
            if len(q.strip()) > 0 and len(a.strip()) > 0:
                self.data_list.append({"question": q, "answer": a})

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]

def qa_data_collator(features):
    batch_questions = [f["question"] for f in features]
    batch_answers = [f["answer"] for f in features]
    return {"question": batch_questions, "answer": batch_answers}

data_list = [
    {"question": "What is the capital of France?", "answer": "Paris"},
    {"question": "Who wrote the play Romeo and Juliet?", "answer": "William Shakespeare"},
    {"question": "What is 2 + 2?", "answer": "4"},
]
train_dataset = QADataset(data_list)

# -----------------------------
# 2. 載入預訓練模型與 Tokenizer
# -----------------------------
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True
)

# -----------------------------
# 3. 設定 LoRA 微調參數，並套用至模型
# -----------------------------
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model, lora_config)
print("模型已套用 LoRA，總參數數量:", sum(p.numel() for p in model.parameters()))

# -----------------------------
# 4. 定義簡易 Reward 與 PPO Loss
# -----------------------------
def simple_reward_function(generated_answer, ground_truth):
    return 1.0 if generated_answer.strip().lower() == ground_truth.strip().lower() else 0.0

def compute_ppo_loss(model, question, ground_truth, tokenizer, generation_kwargs):
    # 先生成 reasonale
    input_ids = tokenizer.encode(question, return_tensors="pt").to(model.device)
    cot_ids = model.generate(
        input_ids,
        max_new_tokens=50,  # 改用 max_new_tokens
        do_sample=True,
        **generation_kwargs
    )
    reasonale = tokenizer.decode(cot_ids[0], skip_special_tokens=True)

    # 組成 prompt: [question + reasonale]
    prompt = question + "\n" + reasonale
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    answer_ids = model.generate(
        prompt_ids,
        max_new_tokens=50,  # 改用 max_new_tokens
        do_sample=True,
        **generation_kwargs
    )
    generated_answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)

    # 計算 reward
    reward = simple_reward_function(generated_answer, ground_truth)

    # 計算交叉熵損失
    outputs = model(prompt_ids, labels=prompt_ids)
    ce_loss = outputs.loss

    # PPO (示意)
    baseline = 0.5
    advantage = reward - baseline
    epsilon = 0.2
    ratio = torch.tensor(1.0).to(model.device)
    clipped_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    ppo_loss = -torch.min(ratio * advantage, clipped_ratio * advantage)

    total_loss = ce_loss + ppo_loss
    return total_loss

# -----------------------------
# 5. 自定義 Trainer
# -----------------------------
class SelfMotivatedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        questions = inputs["question"]
        answers = inputs["answer"]

        generation_kwargs = {"temperature": 0.7, "top_p": 0.9}

        total_loss = 0.0
        batch_size = len(questions)
        for q, a in zip(questions, answers):
            total_loss += compute_ppo_loss(model, str(q), str(a), tokenizer, generation_kwargs)

        final_loss = total_loss / batch_size
        return (final_loss, None) if return_outputs else final_loss

# -----------------------------
# 6. 定義訓練參數
# -----------------------------
training_args = TrainingArguments(
    output_dir="./lora_self_motivated_finetune",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    logging_steps=5,
    save_steps=10,
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
)

# -----------------------------
# 7. 建立 Trainer 實例並訓練
# -----------------------------
trainer = SelfMotivatedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=qa_data_collator,
)

print("開始訓練，使用 self-motivated learning 與 PPO 機制...")
trainer.train()

# -----------------------------
# 8. 儲存模型與 Tokenizer
# -----------------------------
trainer.save_model("./lora_self_motivated_finetune/final_model")
tokenizer.save_pretrained("./lora_self_motivated_finetune/final_model")

print("微調完成，模型儲存於 ./lora_self_motivated_finetune/final_model")




模型已套用 LoRA，總參數數量: 124734720
開始訓練，使用 self-motivated learning 與 PPO 機制...


  0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask

{'train_runtime': 8.3324, 'train_samples_per_second': 0.72, 'train_steps_per_second': 0.48, 'train_loss': 2.6507771015167236, 'epoch': 2.0}
微調完成，模型儲存於 ./lora_self_motivated_finetune/final_model


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# 1. 指定微調後模型的路徑
lora_model_path = "./lora_self_motivated_finetune/final_model"

# 2. 載入 tokenizer
tokenizer = AutoTokenizer.from_pretrained(lora_model_path)

# 3. 載入原始 base model（與你微調時用的相同，如 "gpt2"）
base_model = AutoModelForCausalLM.from_pretrained("gpt2")

# 4. 利用 PEFT 的 API，將 LoRA 權重載入 base model
model = PeftModel.from_pretrained(base_model, lora_model_path)

# 5. 切換到 GPU（若可用）或保持在 CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 6. 測試：做一個簡單的推理
prompt = "What is the capital of France?\nThe reasonale is that"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# 這裡示範用 max_new_tokens，而非 max_length，避免 prompt 太長時的錯誤
outputs = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,     # 是否使用隨機采樣
    top_p=0.9,          # nucleus sampling
    temperature=0.7     # 溫度
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:\n", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text:
 What is the capital of France?
The reasonale is that it is the most comprehensive of all the capital markets in the world. It's the only one in which the capital is concentrated in the hands of one group of investors. It's the only one which has a capital of its own.
We believe that capital can be made up of all kinds of things, including all sorts of things that could be made up of different kinds of capital. That's what we have in the capital market. We believe that it's the best capital in the world.
