In [None]:
# === Shared Cache Bootstrap ===
import os, pathlib, torch

# Root directory for all model/dataset caches; taken from the AI_CACHE_ROOT
# environment variable, falling back to /mnt/ai/cache when it is unset.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")
# Point each cache-related environment variable at a sub-directory of the root.
# Every variable is overwritten unconditionally (a value already present in the
# environment is replaced) and its directory is created if it does not exist.
for k, v in {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}.items():
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

# Report the cache root and whether CUDA is usable.
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())
# Total memory of CUDA device 0 in GB (decimal, 1e9 bytes), or a CPU-mode notice
# when CUDA is unavailable; only device 0 is inspected.
print(
    f"[GPU Memory] {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB"
    if torch.cuda.is_available()
    else "[CPU Mode]"
)


"""
🎯 Learning Objectives (學習目標):

1. **Preference Learning Theory (偏好學習理論)**:
   - DPO vs RLHF fundamental differences (DPO vs RLHF 基本差異)
   - Training complexity and stability comparison (訓練複雜度與穩定性對比)

2. **DPO Implementation (DPO 實作)**:
   - Direct preference optimization with TRL (使用 TRL 進行直接偏好優化)
   - QLoRA + DPO for low-VRAM training (低 VRAM 的 QLoRA + DPO 訓練)

3. **Preference Data Construction (偏好資料構建)**:
   - Create chosen/rejected pairs (建立 chosen/rejected 配對)
   - Quality assessment for preference learning (偏好學習的品質評估)

4. **Comparative Analysis (對比分析)**:
   - Performance vs computational cost (效能 vs 計算成本)
   - When to use DPO vs RLHF (何時使用 DPO vs RLHF)
"""

In [None]:
# ============================================================================
# 📦 Dependencies Installation
# ============================================================================

# Core ML and preference learning libraries
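# Version specifiers are quoted: an unquoted `>=` is parsed by the shell as an
# output redirect. `%pip` installs into the environment of the running kernel.
# %pip install -q "transformers>=4.36" datasets accelerate "bitsandbytes>=0.41"
# %pip install -q "trl>=0.7.0" "peft>=0.7.0"  # TRL for DPO training
# NOTE(review): `DPOConfig` (imported below) is not available in early TRL
# releases - TODO confirm the minimum TRL version and raise the pin above.
# %pip install -q "torch>=2.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118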

import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset, load_dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer, DPOConfig
import json, random, numpy as np
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility.
# The same fixed seed (42) is applied to each of the three RNG sources this
# notebook imports: PyTorch, NumPy's legacy global generator, and Python's
# `random` module (used later to sample prompts for the synthetic dataset).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [None]:
# ============================================================================
# 🧠 DPO vs RLHF Theory Comparison (理論對比)
# ============================================================================


def explain_preference_learning():
    """
    Print and return a side-by-side comparison of DPO and RLHF.

    The comparison is written to stdout as a simple text table (one DPO line
    and one RLHF line per aspect, separated by a dashed rule).

    Returns:
        dict with three parallel lists under the keys "Aspect", "DPO" and
        "RLHF"; index i of each list describes the same aspect.
    """
    # One row per aspect: (aspect, DPO characteristic, RLHF characteristic).
    rows = [
        (
            "Training Paradigm",
            "Direct optimization on preference pairs",
            "RL with reward model + PPO",
        ),
        (
            "Complexity",
            "Simple, single-stage training",
            "Complex, multi-stage (SFT→RM→PPO)",
        ),
        (
            "Stability",
            "More stable, no RL instability",
            "Can be unstable due to RL",
        ),
        (
            "Memory Usage",
            "Lower (no critic model needed)",
            "Higher (actor + critic + ref models)",
        ),
        (
            "Training Time",
            "Faster convergence",
            "Slower, more hyperparameter sensitive",
        ),
        (
            "Performance",
            "Comparable to RLHF in many tasks",
            "State-of-the-art when tuned well",
        ),
    ]

    # Column-oriented view of the same rows; this is what callers receive.
    comparison = {
        "Aspect": [row[0] for row in rows],
        "DPO": [row[1] for row in rows],
        "RLHF": [row[2] for row in rows],
    }

    row_separator = "-" * 80
    print("📊 DPO vs RLHF Comparison (DPO vs RLHF 對比)")
    print("=" * 80)
    for aspect, dpo_note, rlhf_note in rows:
        # The aspect name is padded to 20 columns; the RLHF line leaves that
        # column blank so both notes line up under the same aspect.
        print(f"{aspect:20} | DPO: {dpo_note}")
        print(f"{'':20} | RLHF: {rlhf_note}")
        print(row_separator)

    return comparison


# Display the comparison: the call prints the table and the returned dict of
# parallel lists is kept in `comparison_table`.
comparison_table = explain_preference_learning()

In [None]:
# ============================================================================
# 🗃️ Preference Dataset Preparation (偏好資料準備)
# ============================================================================


def create_chinese_preference_dataset(num_samples: int = 100) -> Dataset:
    """
    Build a small synthetic Chinese preference dataset.

    Each row is produced by drawing one prompt (with replacement) from a fixed
    pool via the global `random` module, then pairing it with a preferred
    ("chosen") and a dispreferred ("rejected") answer. Two prompts have
    hand-written answer pairs; every other prompt gets a generic templated
    chosen answer and a fixed dismissive rejected answer.

    Args:
        num_samples: number of rows to generate.

    Returns:
        A `Dataset` whose rows have the string fields "prompt", "chosen" and
        "rejected".
    """

    # Prompt pool for Chinese assistant evaluation.
    prompt_pool = [
        "請解釋什麼是人工智慧？",
        "如何學習程式設計？",
        "推薦幾本好書給我",
        "請寫一個關於友情的短故事",
        "如何保持健康的生活方式？",
        "解釋量子物理的基本概念",
        "如何提升工作效率？",
        "描述你最喜歡的季節",
        "如何學習一門新語言？",
        "請給出投資建議",
    ]

    # Hand-written (chosen, rejected) pairs. The chosen answer is helpful,
    # detailed and structured; the rejected one is vague or unhelpful.
    curated_pairs = {
        "請解釋什麼是人工智慧？": (
            "人工智慧（AI）是指讓機器模擬人類智能的技術。它包含機器學習、深度學習、自然語言處理等分支。AI可以應用在圖像識別、語音處理、自動駕駛等領域，旨在解決複雜問題並提升效率。",
            "AI就是很厲害的電腦技術，很複雜，我也不太懂。",
        ),
        "如何學習程式設計？": (
            "學習程式設計建議按以下步驟：1) 選擇適合的語言（如Python）2) 掌握基本語法和概念 3) 多做練習項目 4) 參與開源項目 5) 持續學習新技術。建議從簡單項目開始，逐步提升難度。",
            "程式設計很難，你直接去補習班學比較快。",
        ),
    }
    generic_rejected = "不知道，你可以自己查資料。"

    def generate_response_pair(prompt: str) -> Tuple[str, str]:
        """Return the (chosen, rejected) answers for one prompt."""
        if prompt in curated_pairs:
            return curated_pairs[prompt]
        # No curated pair: fall back to the generic template.
        generic_chosen = f"這是一個很好的問題。{prompt}的答案需要從多個角度來分析..."
        return generic_chosen, generic_rejected

    def build_row() -> Dict[str, str]:
        """Sample one prompt and attach its answer pair."""
        picked = random.choice(prompt_pool)
        better, worse = generate_response_pair(picked)
        return dict(prompt=picked, chosen=better, rejected=worse)

    rows = [build_row() for _ in range(num_samples)]
    return Dataset.from_list(rows)


# Create preference dataset (50 synthetic prompt/chosen/rejected rows).
print("🗃️ Creating Chinese Preference Dataset...")
preference_dataset = create_chinese_preference_dataset(num_samples=50)
print(f"Created {len(preference_dataset)} preference pairs")

# Display sample: show the first row; the two answers are cut to their first
# 100 characters, and "..." is appended whether or not anything was cut.
print("\n📝 Sample Preference Pair:")
sample = preference_dataset[0]
print(f"Prompt: {sample['prompt']}")
print(f"Chosen: {sample['chosen'][:100]}...")
print(f"Rejected: {sample['rejected'][:100]}...")

In [None]:
# ============================================================================
# 🔧 Model and Tokenizer Setup (模型與分詞器設定)
# ============================================================================


def setup_model_for_dpo(
    model_name: str = "microsoft/DialoGPT-medium", use_4bit: bool = True
):
    """
    Load a causal language model and its tokenizer for DPO training.

    Args:
        model_name: Hugging Face model identifier to load.
        use_4bit: request 4-bit NF4 quantization (bitsandbytes) to reduce
            VRAM usage. Ignored, with a printed notice, when CUDA is not
            available, so the notebook still runs in CPU mode.

    Returns:
        (model, tokenizer). The tokenizer is guaranteed to have a pad token:
        when the checkpoint defines none, the EOS token is reused.
    """
    cuda_available = torch.cuda.is_available()

    # The 4-bit path is only attempted on CUDA. In CPU mode fall back to an
    # unquantized load instead of failing inside `from_pretrained`.
    if use_4bit and not cuda_available:
        print("⚠️ CUDA not available - loading without 4-bit quantization")
        use_4bit = False

    # Configure quantization for low VRAM
    if use_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
    else:
        bnb_config = None

    # Half precision is kept for GPU runs; on CPU use float32, where every
    # operator is implemented.
    model_dtype = torch.float16 if cuda_available else torch.float32

    # Load model
    print(f"🤖 Loading model: {model_name}")
    # NOTE(review): trust_remote_code=True lets the model repository execute
    # its own Python code at load time. It is kept for backward compatibility;
    # only pass model names from sources you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=model_dtype,
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Batching needs a pad token; reuse EOS when the checkpoint has none.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer


# Setup model (using smaller model for demo)
model_name = "microsoft/DialoGPT-medium"  # Smaller model for demo
# Binds the notebook-wide `model` and `tokenizer` that every later cell reads.
model, tokenizer = setup_model_for_dpo(model_name, use_4bit=True)

print(f"✅ Model loaded: {model_name}")
# Counts the elements of every tensor yielded by model.parameters(), in millions.
print(f"📏 Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

In [None]:
# ============================================================================
# 🎯 DPO Training Implementation (DPO 訓練實作)
# ============================================================================


def prepare_model_for_dpo_training(model, tokenizer):
    """
    Attach LoRA adapters to a (possibly quantized) model for DPO training.

    The function is idempotent: a model that already carries a PEFT adapter
    configuration is returned unchanged. This matters in a notebook, where the
    calling cell rebinds `model` to the result and may be executed more than
    once; without the guard each re-run would push the already-adapted model
    through k-bit preparation and LoRA wrapping again.

    Args:
        model: model returned by `setup_model_for_dpo`.
        tokenizer: accepted for interface compatibility; not used here.

    Returns:
        The model with LoRA adapters attached.
    """

    # Already adapted (e.g. this cell was re-run): do not wrap a second time.
    if hasattr(model, "peft_config"):
        print("ℹ️ LoRA adapters already attached - skipping preparation")
        return model

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # LoRA configuration for DPO
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["c_attn", "c_proj"],  # For DialoGPT
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    # Add LoRA adapters
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model


# Prepare model for training
# NOTE(review): this rebinds the notebook-wide `model` to the adapted model, so
# re-running the cell passes the already-adapted model back into the function.
print("🔧 Preparing model for DPO training...")
model = prepare_model_for_dpo_training(model, tokenizer)


def format_preference_data_for_dpo(dataset: Dataset, tokenizer) -> Dataset:
    """
    Normalize a preference dataset into the flat layout used for DPO training.

    The result keeps exactly three string columns - "prompt", "chosen" and
    "rejected" - which is the layout the TRL DPO trainer documents for
    preference data. (The previous implementation packed each row into a
    single "formatted" struct column and removed the three flat columns, so
    its output could not be fed to the trainer.)

    Rows that carry no preference signal are dropped: any row where one of the
    three fields is missing/blank, or where "chosen" equals "rejected".

    Args:
        dataset: dataset containing at least the three required columns.
        tokenizer: accepted for interface compatibility; not used here.

    Returns:
        A dataset with only the "prompt", "chosen" and "rejected" columns.

    Raises:
        KeyError: if a required column is missing from `dataset`.
    """
    required_columns = ("prompt", "chosen", "rejected")

    missing = [name for name in required_columns if name not in dataset.column_names]
    if missing:
        raise KeyError(f"Preference dataset is missing required columns: {missing}")

    # Drop everything the trainer does not need.
    extra_columns = [
        name for name in dataset.column_names if name not in required_columns
    ]
    if extra_columns:
        dataset = dataset.remove_columns(extra_columns)

    def is_usable_pair(example) -> bool:
        """True when all three fields are non-blank and the answers differ."""
        for name in required_columns:
            text = example[name]
            if not isinstance(text, str) or not text.strip():
                return False
        return example["chosen"] != example["rejected"]

    return dataset.filter(is_usable_pair)


# Format dataset for DPO
# NOTE: no reformatting happens here. `preference_dataset` is built with flat
# "prompt"/"chosen"/"rejected" fields, and this statement only splits it 80/20
# into "train" and "test" (seeded for reproducibility);
# format_preference_data_for_dpo() is not called.
print("📋 Formatting preference dataset for DPO...")
formatted_dataset = preference_dataset.train_test_split(test_size=0.2, seed=42)


def train_dpo_model(model, tokenizer, train_dataset, eval_dataset=None):
    """
    Train a model with Direct Preference Optimization (TRL `DPOTrainer`).

    No explicit reference model is passed to the trainer.

    Args:
        model: model to optimize (here: the LoRA-adapted model).
        tokenizer: tokenizer matching `model`.
        train_dataset: preference data with "prompt"/"chosen"/"rejected".
        eval_dataset: optional held-out preference data. When given,
            evaluation is enabled every `eval_steps` optimizer steps.

    Returns:
        The trainer's training result, or None when training raised (the
        error is printed instead of propagated, so the notebook keeps running).
    """
    # Local import: only this function needs it.
    import inspect

    # Mixed precision is only valid on CUDA. Checking availability first also
    # avoids querying bf16 support when there is no GPU at all.
    cuda_available = torch.cuda.is_available()
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    use_fp16 = cuda_available and not use_bf16

    # DPO training configuration
    config_kwargs = dict(
        output_dir="./dpo_output",
        num_train_epochs=1,  # Short for demo
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=5e-6,
        max_length=256,
        max_prompt_length=128,
        beta=0.1,  # DPO regularization parameter (configured here only)
        logging_steps=10,
        save_steps=50,
        eval_steps=50,
        warmup_ratio=0.1,
        remove_unused_columns=False,
        gradient_checkpointing=True,
        dataloader_drop_last=True,
        bf16=use_bf16,
        fp16=use_fp16,
    )

    if eval_dataset is not None:
        # `eval_steps` has no effect unless step-wise evaluation is switched
        # on. The argument was renamed between transformers releases, so use
        # whichever name this installation defines.
        config_fields = getattr(DPOConfig, "__dataclass_fields__", {})
        strategy_key = (
            "eval_strategy" if "eval_strategy" in config_fields else "evaluation_strategy"
        )
        config_kwargs[strategy_key] = "steps"

    training_args = DPOConfig(**config_kwargs)

    # Newer TRL releases take the tokenizer as `processing_class`; older ones
    # as `tokenizer`. Pick the keyword the installed trainer accepts.
    trainer_params = inspect.signature(DPOTrainer.__init__).parameters
    tokenizer_kwarg = (
        "processing_class" if "processing_class" in trainer_params else "tokenizer"
    )

    # Initialize DPO trainer. `beta` is not repeated here: it is already part
    # of `training_args`.
    dpo_trainer = DPOTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        **{tokenizer_kwarg: tokenizer},
    )

    print("🚀 Starting DPO training...")
    try:
        # Start training
        train_result = dpo_trainer.train()
        print("✅ DPO training completed!")

        # Save model
        dpo_trainer.save_model()
        print("💾 Model saved to ./dpo_output")

        return train_result

    except Exception as e:
        # Deliberate best-effort: report and return None rather than abort
        # the whole notebook run.
        print(f"❌ Training failed: {e}")
        print("💡 This might be due to memory constraints or data formatting issues")
        return None


# Note: Actual training requires more memory and time
# Training is not started by this cell; it only prints the call a reader can
# run manually (the command is emitted as text, it is not executed).
print("📝 DPO Training Setup Complete")
print("💡 To run actual training, uncomment the following lines:")
print(
    "# train_result = train_dpo_model(model, tokenizer, formatted_dataset['train'], formatted_dataset['test'])"
)

In [None]:
# ============================================================================
# 🆚 RLHF vs DPO Comparison Implementation (RLHF vs DPO 比較實作)
# ============================================================================


def simulate_rlhf_workflow():
    """
    Print the stages of an RLHF pipeline and a DPO-vs-RLHF resource summary.

    Nothing is trained or measured here; the function only prints fixed
    descriptive text for comparison purposes.

    Returns:
        dict mapping each resource aspect to a two-item list
        ["DPO: ...", "RLHF: ..."].
    """

    print("🔄 RLHF Workflow Simulation:")
    print("=" * 50)

    # Ordered RLHF stages as (title, description).
    workflow_stages = (
        ("1. Supervised Fine-tuning (SFT)", "Train base model on demonstration data"),
        ("2. Reward Model Training", "Train reward model on preference pairs"),
        ("3. PPO Training", "Optimize policy using PPO with reward model"),
        ("4. Iterative Refinement", "Multiple rounds of data collection and training"),
    )
    print("\n".join(f"{title}: {detail}" for title, detail in workflow_stages))

    print("\n📊 Resource Requirements Comparison:")

    # (aspect, DPO value, RLHF value); the labelled strings are derived below.
    requirement_rows = (
        ("Training Stages", "1 stage", "3-4 stages"),
        ("Models Required", "Base + Reference", "Base + Reward + Actor + Critic"),
        ("Memory Usage", "~2x base model", "~4x base model"),
        ("Training Time", "Hours", "Days to weeks"),
        ("Hyperparameter Sensitivity", "Low", "High"),
        ("Implementation Complexity", "Simple", "Complex"),
    )
    resource_comparison = {
        aspect: [f"DPO: {dpo_value}", f"RLHF: {rlhf_value}"]
        for aspect, dpo_value, rlhf_value in requirement_rows
    }

    for aspect, (dpo_side, rlhf_side) in resource_comparison.items():
        # Aspect names are padded to 25 columns so the values line up.
        print(f"{aspect:25}: {dpo_side} vs {rlhf_side}")

    return resource_comparison


# Run RLHF simulation: prints the workflow and resource summary, and keeps the
# returned aspect -> [DPO, RLHF] mapping in `rlhf_comparison`.
rlhf_comparison = simulate_rlhf_workflow()

In [None]:
# ============================================================================
# 📊 Evaluation and Comparison (評估與比較)
# ============================================================================


def _count_text_units(text: str) -> int:
    """Count rough 'word' units: whitespace-delimited runs, plus one per CJK ideograph.

    For text without CJK ideographs this equals ``len(text.split())``. Chinese
    text has no spaces between words, so each ideograph is counted on its own
    instead of the whole sentence counting as a single word.
    """
    units = 0
    inside_word = False
    for ch in text:
        if "\u3400" <= ch <= "\u4dbf" or "\u4e00" <= ch <= "\u9fff":
            units += 1
            inside_word = False
        elif ch.isspace():
            inside_word = False
        elif not inside_word:
            units += 1
            inside_word = True
    return units


def evaluate_preference_model(model, tokenizer, test_prompts: List[str]) -> Dict:
    """
    Generate one sampled response per prompt and score it with a length heuristic.

    Args:
        model: causal LM exposing `eval()` and `generate()`.
        tokenizer: tokenizer matching `model`.
        test_prompts: prompts to answer; each is truncated to 128 tokens.

    Returns:
        dict of parallel lists:
            "prompts"            - the input prompts,
            "responses"          - newly generated text only (prompt removed),
            "response_lengths"   - character length of each response,
            "helpfulness_scores" - min(text units / 20, 1.0); a crude proxy
                                   based on length only, not answer quality.
    """

    print("📊 Evaluating model responses...")

    results = {
        "prompts": [],
        "responses": [],
        "response_lengths": [],
        "helpfulness_scores": [],
    }

    model.eval()

    # Inputs must live on the same device as the model's weights.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    for prompt in test_prompts:
        # Tokenize via __call__ so an attention mask is produced as well; the
        # pad token may equal EOS, so the mask cannot be inferred reliably.
        encoded = tokenizer(
            prompt, return_tensors="pt", max_length=128, truncation=True
        )
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)
        prompt_token_count = input_ids.shape[1]

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=100,
                do_sample=True,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Strip the prompt at token level: slicing the decoded string by
        # len(prompt) is wrong whenever decoding does not reproduce the prompt
        # character-for-character (e.g. after truncation).
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Simple helpfulness scoring: longer answers score higher, capped at 1.
        helpfulness_score = min(_count_text_units(response) / 20, 1.0)

        results["prompts"].append(prompt)
        results["responses"].append(response)
        results["response_lengths"].append(len(response))
        results["helpfulness_scores"].append(helpfulness_score)

    return results


# Test prompts for evaluation
test_prompts = [
    "請解釋機器學習的基本概念",
    "如何提升學習效率？",
    "推薦一些好用的程式工具",
]

# Evaluate the notebook-wide `model` as it currently stands (no training run
# has been executed in this notebook up to this point).
print("🧪 Running model evaluation...")
try:
    eval_results = evaluate_preference_model(model, tokenizer, test_prompts)

    # One numbered entry per prompt; responses are cut to 100 characters and
    # "..." is appended unconditionally.
    print("\n📋 Evaluation Results:")
    for i, (prompt, response, score) in enumerate(
        zip(
            eval_results["prompts"],
            eval_results["responses"],
            eval_results["helpfulness_scores"],
        )
    ):
        print(f"\n{i+1}. Prompt: {prompt}")
        print(f"   Response: {response[:100]}...")
        print(f"   Helpfulness Score: {score:.2f}")

# NOTE(review): any failure is reduced to a one-line message, so the cell
# always completes; when this branch runs, `eval_results` is not (re)assigned.
except Exception as e:
    print(f"⚠️ Evaluation failed: {e}")
    print("💡 This is expected with the demo setup")

In [None]:
# ============================================================================
# 📈 Best Practices and Recommendations (最佳實務與建議)
# ============================================================================


def provide_dpo_rlhf_recommendations():
    """
    Print and return a guide for choosing between DPO and RLHF.

    Returns:
        dict mapping a category heading ("Use DPO When", "Use RLHF When",
        "Hybrid Approaches") to its list of bilingual recommendation strings.
    """

    when_to_use_dpo = [
        "Limited computational resources (使用有限計算資源時)",
        "Need stable and reproducible training (需要穩定可重現的訓練時)",
        "Working with smaller models (<7B parameters) (使用較小模型時)",
        "Preference data is high-quality and sufficient (偏好資料品質高且充足時)",
        "Quick iteration and experimentation needed (需要快速迭代實驗時)",
    ]
    when_to_use_rlhf = [
        "Maximum performance is critical (最大效能至關重要時)",
        "Have abundant computational resources (有充足計算資源時)",
        "Working with large models (>13B parameters) (使用大型模型時)",
        "Can invest in reward model engineering (可投資獎勵模型工程時)",
        "Long-term production deployment planned (計劃長期生產部署時)",
    ]
    hybrid_options = [
        "Start with DPO for rapid prototyping (從 DPO 開始快速原型設計)",
        "Use DPO findings to inform RLHF design (用 DPO 發現來指導 RLHF 設計)",
        "Apply DPO for domain adaptation, RLHF for final tuning (用 DPO 做領域適應，RLHF 做最終調優)",
    ]

    recommendations = {
        "Use DPO When": when_to_use_dpo,
        "Use RLHF When": when_to_use_rlhf,
        "Hybrid Approaches": hybrid_options,
    }

    print("🎯 DPO vs RLHF Selection Guide")
    print("=" * 50)

    # Each category: a blank line, the heading, then one bullet per entry.
    for heading in recommendations:
        print(f"\n{heading}:")
        print("\n".join(f"  • {entry}" for entry in recommendations[heading]))

    return recommendations


# Display recommendations: prints the selection guide and keeps the returned
# heading -> list-of-strings mapping in `recommendations`.
recommendations = provide_dpo_rlhf_recommendations()

In [None]:
# ============================================================================
# 🏆 Chapter Summary and Key Takeaways (章節總結與關鍵要點)
# ============================================================================


def summarize_dpo_rlhf_learning():
    """
    Print and return the chapter's key takeaways on DPO vs RLHF.

    Returns:
        dict mapping a section heading ("Key Concepts Learned",
        "Practical Skills Gained", "Common Pitfalls") to its list of
        bilingual summary points.
    """

    # (heading, points) in display order.
    sections = (
        (
            "Key Concepts Learned",
            [
                "DPO simplifies preference learning by removing RL complexity (DPO 通過移除 RL 複雜性簡化偏好學習)",
                "Preference data quality is crucial for both approaches (偏好資料品質對兩種方法都至關重要)",
                "Resource requirements differ significantly between DPO and RLHF (DPO 和 RLHF 的資源需求差異很大)",
                "Training stability varies between the two methods (兩種方法的訓練穩定性不同)",
            ],
        ),
        (
            "Practical Skills Gained",
            [
                "Setting up DPO training with TRL and LoRA (使用 TRL 和 LoRA 設置 DPO 訓練)",
                "Creating and formatting preference datasets (建立和格式化偏好資料集)",
                "Comparing training paradigms for preference learning (比較偏好學習的訓練範式)",
                "Evaluating preference-trained models (評估偏好訓練的模型)",
            ],
        ),
        (
            "Common Pitfalls",
            [
                "Insufficient preference data quality can harm both methods (偏好資料品質不足會損害兩種方法)",
                "DPO beta parameter requires careful tuning (DPO beta 參數需要仔細調優)",
                "Memory management crucial for multi-model RLHF setup (記憶體管理對多模型 RLHF 設置至關重要)",
                "Evaluation metrics for preference learning need careful design (偏好學習的評估指標需要仔細設計)",
            ],
        ),
    )
    summary = dict(sections)

    print("📚 Chapter 24 Learning Summary")
    print("=" * 50)

    for heading, points in sections:
        print(f"\n{heading}:")
        for takeaway in points:
            print(f"  ✓ {takeaway}")

    return summary


# Display learning summary: prints the takeaways and keeps the returned
# heading -> list-of-points mapping in `learning_summary`.
learning_summary = summarize_dpo_rlhf_learning()

In [None]:
# ============================================================================
# 🧪 Smoke Test (驗收測試)
# ============================================================================


def run_dpo_rlhf_smoke_test():
    """
    Run four quick checks against the notebook's global state.

    Reads the notebook-level `model`, `tokenizer` and `preference_dataset`
    and verifies that: (1) model and tokenizer exist, (2) the preference
    dataset is non-empty with the expected columns, (3) fewer than 5% of the
    parameters are trainable (i.e. LoRA is active), (4) greedy generation
    produces text longer than the prompt.

    A failing check never aborts the run: it is recorded together with the
    reason (exception type and message) so the cause is visible in the output.
    Only `Exception` subclasses are caught, so Ctrl-C still interrupts.

    Returns:
        True when at least 75% of the checks passed, otherwise False.
    """

    print("🧪 Running DPO vs RLHF Smoke Test...")

    def check_model_loaded():
        assert model is not None and tokenizer is not None
        return "Model and tokenizer loaded successfully"

    def check_dataset_format():
        assert len(preference_dataset) > 0, "dataset is empty"
        for column in ("prompt", "chosen", "rejected"):
            assert column in preference_dataset.column_names, f"missing column {column!r}"
        return "Preference dataset created with correct format"

    def check_lora_applied():
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in model.parameters())
        trainable_percentage = trainable_params / total_params * 100
        # LoRA should have <5% trainable params
        assert trainable_percentage < 5, f"{trainable_percentage:.2f}% trainable"
        return f"LoRA applied ({trainable_percentage:.2f}% trainable)"

    def check_generation():
        test_prompt = "Hello"
        encoded = tokenizer(test_prompt, return_tensors="pt")
        # Inputs must be on the same device as the model's weights.
        device = next(model.parameters()).device
        with torch.no_grad():
            outputs = model.generate(
                input_ids=encoded["input_ids"].to(device),
                attention_mask=encoded["attention_mask"].to(device),
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        assert len(response) > len(test_prompt), "no new text generated"
        return "Basic text generation functional"

    # (message shown on failure, check to run) in execution order.
    checks = (
        ("Model/tokenizer loading failed", check_model_loaded),
        ("Preference dataset creation failed", check_dataset_format),
        ("LoRA configuration failed", check_lora_applied),
        ("Text generation failed", check_generation),
    )

    tests = []
    for failure_label, check in checks:
        try:
            tests.append(f"✅ {check()}")
        except Exception as exc:
            reason = type(exc).__name__
            if str(exc):
                reason = f"{reason}: {exc}"
            tests.append(f"❌ {failure_label} ({reason})")

    # Display results
    print("\n📋 Test Results:")
    for test in tests:
        print(f"  {test}")

    success_rate = sum(1 for test in tests if test.startswith("✅")) / len(tests)
    print(f"\n🎯 Success Rate: {success_rate:.1%}")

    return success_rate >= 0.75


# Run smoke test
smoke_test_passed = run_dpo_rlhf_smoke_test()

print("\n" + "=" * 80)
print("🎉 Notebook 24 Complete: DPO vs RLHF Preference Learning")
print("=" * 80)

# `smoke_test_passed` is True when at least 75% of the checks passed, so the
# success branch can be taken even though one check failed.
if smoke_test_passed:
    print("✅ All systems functional - ready for preference learning!")
else:
    print("⚠️  Some issues detected - review setup before proceeding")

print("\n📖 Next Steps:")
print("  • Try actual DPO training with more data and compute")
print("  • Experiment with different beta values in DPO")
print("  • Compare results with RLHF when resources allow")
print("  • Explore domain-specific preference learning (nb25)")

In [None]:
# ============================================================================
# 🧪 Final Acceptance Test (最終驗收測試)
# ============================================================================


def final_acceptance_test():
    """
    Assert that the notebook's DPO setup is in place.

    Reads the notebook-level `model`, `tokenizer` and `preference_dataset`
    and checks, in order: both model and tokenizer are truthy, the preference
    dataset is non-empty, and fewer than 10% of the model's parameters are
    trainable (taken as evidence that LoRA is applied).

    Returns:
        True when every check passes.

    Raises:
        AssertionError: on the first failing check.
    """
    assert model and tokenizer, "Model/tokenizer setup failed"
    assert len(preference_dataset) > 0, "Preference dataset empty"

    # Tally trainable and total parameter counts.
    trainable_count = 0
    total_count = 0
    for param in model.parameters():
        size = param.numel()
        total_count += size
        if param.requires_grad:
            trainable_count += size

    assert trainable_count < total_count * 0.1, "LoRA not applied"
    print("✅ DPO setup complete - preference learning ready!")
    return True


# Run the acceptance checks; an AssertionError from the first failing check
# propagates and stops the cell.
final_acceptance_test()


## 6. 本章小結

### ✅ 完成項目 (Completed Items)
- **DPO 理論與實作**：完成 Direct Preference Optimization 的理論說明與訓練設置
- **偏好資料集構建**：建立中文偏好配對資料集，包含 chosen/rejected 回應
- **低資源訓練方案**：整合 QLoRA + DPO 實現低 VRAM 偏好學習
- **DPO vs RLHF 對比**：全面比較兩種偏好學習方法的優缺點
- **評估框架建立**：設計偏好學習效果的評估方法

### 🧠 核心原理要點 (Key Concepts)
- **DPO 簡化優勢**：透過直接優化偏好配對，避免 RLHF 的 RL 複雜性與不穩定性
- **資源需求差異**：DPO 僅需 ~2x 基礎模型記憶體，RLHF 需要 ~4x（actor/critic/reward/reference models）
- **訓練穩定性**：DPO 訓練更穩定，超參數敏感度較低
- **適用場景區分**：DPO 適合資源受限與快速迭代，RLHF 適合追求極致效能
- **Beta 參數重要性**：DPO 的 beta 參數控制正則化強度，需要仔細調優

### ⚠️ 常見坑點 (Common Pitfalls)
- **偏好資料品質**：低品質的 chosen/rejected 配對會嚴重影響兩種方法的效果
- **記憶體管理**：即使是 DPO 也需要載入 reference model，需注意記憶體配置
- **評估困難性**：偏好學習的效果評估比傳統監督學習更具挑戰性
- **過度優化風險**：DPO 可能過度適應訓練資料中的偏好模式

### 🚀 下一步建議 (Next Steps)
1. **領域特定微調 (nb25)**：將偏好學習應用到特定領域（醫療/法律/金融）
2. **進階評估方法**：實作更複雜的偏好學習評估指標
3. **混合策略探索**：嘗試 DPO + SFT 或 DPO + 小規模 RLHF 的組合方法
4. **生產部署準備**：優化推理效能並準備實際部署環境

---

**階段性里程碑：Part D (Fine-tuning) 接近完成！**

我們已經完成了 Fine-tuning 階段的核心技術：
- ✅ LoRA 微調 (nb20)
- ✅ QLoRA 低 VRAM 訓練 (nb21) 
- ✅ Adapters/Prefix Tuning (nb22)
- ✅ 資料集整理與清洗 (nb23)
- ✅ **DPO vs RLHF 偏好學習 (nb24)** ← 剛完成
- 🔄 領域特定微調 (nb25) ← 下一個

完成 nb25 後，我們將進入 **Part E: RAG × Agents (高階應用)** 階段，整合前面學習的所有技術！