In [None]:
# ===================================================================
# Cell 1: Shared Cache Bootstrap & Environment Setup
# ===================================================================
import os, pathlib, torch, gc
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Shared cache layout used by all notebooks. The root can be overridden with
# the AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "/mnt/ai/cache")
cache_paths = dict(
    HF_HOME=f"{AI_CACHE_ROOT}/hf",
    TRANSFORMERS_CACHE=f"{AI_CACHE_ROOT}/hf/transformers",
    HF_DATASETS_CACHE=f"{AI_CACHE_ROOT}/hf/datasets",
    HUGGINGFACE_HUB_CACHE=f"{AI_CACHE_ROOT}/hf/hub",
    TORCH_HOME=f"{AI_CACHE_ROOT}/torch",
)

# Export each cache location as an environment variable and make sure the
# directory exists.
for env_name in cache_paths:
    cache_dir = cache_paths[env_name]
    os.environ[env_name] = cache_dir
    os.makedirs(cache_dir, exist_ok=True)

# Report the cache root and the first CUDA device, if there is one.
print(f"[Cache] Root: {AI_CACHE_ROOT}")
cuda_ready = torch.cuda.is_available()
print(f"[GPU] Available: {cuda_ready}")
if cuda_ready:
    print(f"[GPU] Device: {torch.cuda.get_device_name(0)}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"[GPU] Memory: {total_gb:.1f} GB")

In [None]:
# ===================================================================
# Cell 2: Memory Profiling & System Requirements Check
# ===================================================================
def check_system_requirements():
    """Pick training settings from the total memory of CUDA device 0.

    Returns:
        dict: ``{"device": "cpu", "max_batch_size": 1}`` when CUDA is not
        available or the device reports less than 8 GB (1 GB = 1e9 bytes).
        Otherwise a dict with ``model_name``, ``max_batch_size``,
        ``max_seq_length``, ``gradient_accumulation_steps`` and
        ``device == "cuda"``.
    """
    if not torch.cuda.is_available():
        print("⚠️  No GPU detected - falling back to CPU (very slow)")
        return {"device": "cpu", "max_batch_size": 1}

    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"🎯 GPU Memory: {gpu_memory_gb:.1f} GB")

    # One row per tier, largest first:
    # (minimum GB, batch size, sequence length, accumulation steps, message)
    tiers = (
        (16, 4, 2048, 2, "✅ High-end GPU: Can use 7B model with comfortable batch size"),
        (12, 2, 1536, 4, "✅ Mid-range GPU: 7B model with reduced batch size"),
        (8, 1, 1024, 8, "⚠️  Low-end GPU: Minimal settings, expect slower training"),
    )
    for minimum_gb, batch_size, seq_length, accumulation, message in tiers:
        if gpu_memory_gb >= minimum_gb:
            print(message)
            return {
                "model_name": "Qwen/Qwen2.5-7B-Instruct",
                "max_batch_size": batch_size,
                "max_seq_length": seq_length,
                "gradient_accumulation_steps": accumulation,
                "device": "cuda",
            }

    # No tier matched: the device has less than 8 GB.
    print("❌ Insufficient GPU memory (<8GB) - CPU fallback recommended")
    return {"device": "cpu", "max_batch_size": 1}


system_config = check_system_requirements()

In [None]:
# ===================================================================
# Cell 3: QLoRA Configuration & Dependencies Installation
# ===================================================================
# Install required packages (run once)
"""
pip install transformers>=4.36.0 datasets accelerate peft bitsandbytes>=0.41.0
pip install wandb tensorboard  # Optional: for logging
"""

import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import json
from typing import Dict, List

print(f"Transformers version: {transformers.__version__}")

# Pick the dtype used for the de-quantized matmuls. bfloat16 is kept unless a
# CUDA device is present that does not support it, in which case float16 is
# used instead. The availability check comes first so CUDA is never queried on
# a machine without a GPU.
cuda_without_bf16 = torch.cuda.is_available() and not torch.cuda.is_bf16_supported()
compute_dtype = torch.float16 if cuda_without_bf16 else torch.bfloat16

# QLoRA 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Store the base model weights in 4-bit
    # Nested quantization: the quantization constants are quantized too.
    # This saves additional memory; it is not an accuracy improvement.
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # NormalFloat4 data type
    bnb_4bit_compute_dtype=compute_dtype,  # bf16 where supported, else fp16
)

# LoRA configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    r=16,  # Low-rank dimension (higher = more capacity, more memory)
    lora_alpha=32,  # LoRA scaling parameter (here 2*r)
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Qwen2.5 attention & MLP layers
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none",  # Don't adapt bias parameters
    task_type="CAUSAL_LM",  # Causal language modeling task
)

print("✅ QLoRA configuration ready")

In [None]:
# ===================================================================
# Cell 4: Model & Tokenizer Loading with Memory Optimization
# ===================================================================
model_name = system_config.get("model_name", "Qwen/Qwen2.5-7B-Instruct")
print(f"🔄 Loading model: {model_name}")

# Free GPU memory before loading. Garbage is collected first so that tensors
# released by the collection are already gone when the CUDA caching allocator
# is asked to hand its unused blocks back.
if torch.cuda.is_available():
    gc.collect()
    torch.cuda.empty_cache()

# Load tokenizer
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally; confirm it is actually required for this model before
# keeping it.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_paths["TRANSFORMERS_CACHE"],
    trust_remote_code=True,
    pad_token="<|endoftext|>",  # Ensure we have a padding token
)

# Load model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Automatically distribute across available devices
    cache_dir=cache_paths["TRANSFORMERS_CACHE"],
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for non-quantized parts
    low_cpu_mem_usage=True,  # Reduce CPU memory usage during loading
)

# Prepare model for k-bit training (essential for QLoRA)
model = prepare_model_for_kbit_training(model)

# Apply LoRA adapters
model = get_peft_model(model, lora_config)

# Print trainable parameters summary
model.print_trainable_parameters()

print("✅ Model loaded and configured for QLoRA training")

In [None]:
# ===================================================================
# Cell 5: Dataset Preparation (Chinese Instruction Following)
# ===================================================================
def create_chinese_instruction_dataset(size: int = 100) -> List[Dict]:
    """Build a demo instruction dataset by cycling through three fixed pairs.

    Args:
        size: Number of records to produce; values <= 0 give an empty list.

    Returns:
        A list of ``size`` fresh dicts with the keys "instruction" and
        "response". Record ``n`` repeats seed pair ``n % 3`` unchanged.
    """
    # (instruction, response) seed pairs in Traditional Chinese.
    seed_pairs = (
        (
            "請用繁體中文回答：什麼是人工智慧？",
            "人工智慧（AI）是指讓機器模擬人類智能行為的技術領域。它包括機器學習、深度學習、自然語言處理等技術，目標是創造能夠理解、學習和解決問題的智能系統。",
        ),
        (
            "解釋什麼是機器學習，並舉一個例子。",
            "機器學習是人工智慧的一個分支，通過分析大量數據來發現模式並做出預測。例如：電子郵件垃圾郵件過濾器，它學習識別垃圾郵件的特徵，然後自動將可疑郵件歸類為垃圾郵件。",
        ),
        (
            "請推薦三本適合初學者的 Python 程式設計書籍。",
            "以下是三本適合初學者的 Python 書籍：1. 《Python Crash Course》- 實務導向，適合快速入門。2. 《Automate the Boring Stuff with Python》- 著重自動化應用。3. 《Learning Python》- 詳細且全面的基礎教學。",
        ),
    )

    # Round-robin over the seeds; every record is a new dict object.
    cycled = (seed_pairs[position % len(seed_pairs)] for position in range(size))
    return [
        {"instruction": question, "response": answer} for question, answer in cycled
    ]


def format_instruction_data(examples):
    """Render a batch of instruction/response pairs as chat-formatted text.

    Args:
        examples: Batch mapping with parallel "instruction" and "response"
            sequences. Pairing stops at the shorter of the two.

    Returns:
        ``{"text": [...]}`` holding one user/assistant turn per pair, wrapped
        in ``<|im_start|>`` / ``<|im_end|>`` markers.
    """
    turns = zip(examples["instruction"], examples["response"])
    return {
        "text": [
            f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"
            for question, answer in turns
        ]
    }


# Build the demo corpus: generate the raw pairs, wrap them in a Dataset, then
# replace the instruction/response columns with one chat-formatted "text" column.
raw_data = create_chinese_instruction_dataset(size=50)  # kept small for the demo
dataset = Dataset.from_list(raw_data).map(
    format_instruction_data,
    batched=True,
    remove_columns=["instruction", "response"],
)

print(f"✅ Created dataset with {len(dataset)} examples")
print(f"Sample formatted text:\n{dataset[0]['text'][:200]}...")

In [None]:
# ===================================================================
# Cell 6: Training Configuration & Memory-Efficient Setup
# ===================================================================
def tokenize_function(examples):
    """Tokenize a batch of chat-formatted texts without padding.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, truncated to
        ``system_config["max_seq_length"]`` (1024 when that key is absent).
        No "labels" entry is added.
    """
    # Labels are deliberately not produced here. The sequences are unpadded, so
    # a "labels" column would be ragged and could not be stacked into a tensor
    # once a batch contains sequences of different lengths. The causal-LM data
    # collator (mlm=False) derives the labels from the padded input_ids when
    # it assembles each batch.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=system_config.get("max_seq_length", 1024),
        return_tensors=None,
    )


import inspect  # stdlib; only needed for the keyword-name check below

# Tokenize dataset
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"], desc="Tokenizing dataset"
)

# Mixed precision: request bf16 only when the CUDA device supports it and fall
# back to fp16 on CUDA devices that do not. Asking for bf16 on unsupported
# hardware makes TrainingArguments raise. Both stay off without CUDA.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

# Newer transformers releases name the evaluation schedule `eval_strategy`;
# older ones only accept `evaluation_strategy`. Pass whichever one the
# installed version declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Training arguments optimized for low VRAM
training_args = TrainingArguments(
    output_dir=f"{AI_CACHE_ROOT}/qlora_checkpoints",
    # Training schedule
    num_train_epochs=1,  # Short training for demo
    per_device_train_batch_size=system_config.get("max_batch_size", 1),
    gradient_accumulation_steps=system_config.get("gradient_accumulation_steps", 4),
    # Memory optimization
    dataloader_pin_memory=False,  # Reduce memory usage
    gradient_checkpointing=True,  # Trade compute for memory
    fp16=use_fp16,
    bf16=use_bf16,
    # Learning rate and optimization
    learning_rate=2e-4,  # Slightly higher for LoRA
    weight_decay=0.01,
    warmup_steps=10,
    # Logging and saving
    logging_steps=5,
    save_steps=25,
    save_total_limit=2,  # Keep only 2 checkpoints
    # Evaluation
    eval_steps=25,
    # Performance
    remove_unused_columns=False,
    # The string "none" disables wandb/tensorboard reporting. Python None does
    # not: it falls back to the library default.
    report_to="none",
    # Memory cleanup
    dataloader_num_workers=0,  # Avoid multiprocessing overhead
    **{eval_strategy_kwarg: "steps"},
)

print("✅ Training arguments configured for low VRAM usage")

In [None]:
# ===================================================================
# Cell 7: Training Loop with Memory Monitoring
# ===================================================================
from transformers import DataCollatorForLanguageModeling

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Not masked language modeling
    pad_to_multiple_of=8,  # Optimize for tensor cores
)

# Split dataset for training and evaluation: first 80% / last 20%.
# The boundary is derived from the dataset length, so changing the dataset
# size in the preparation cell cannot push the indices out of range.
# Integer arithmetic keeps the 50-example demo at exactly 40 / 10.
num_examples = len(tokenized_dataset)
num_train_examples = num_examples * 4 // 5
train_dataset = tokenized_dataset.select(range(num_train_examples))
eval_dataset = tokenized_dataset.select(range(num_train_examples, num_examples))


# Custom trainer class for memory monitoring
class MemoryMonitorTrainer(Trainer):
    """Trainer that attaches peak GPU memory to every log record."""

    def log(self, logs: Dict[str, float], *args, **kwargs) -> None:
        """Add ``gpu_memory_gb`` to ``logs`` and delegate to ``Trainer.log``.

        Args:
            logs: Metrics about to be logged; mutated in place when CUDA is
                available.
            *args, **kwargs: Forwarded untouched to the parent. Newer
                transformers releases pass an extra ``start_time`` argument to
                ``log``; accepting and forwarding it keeps this override
                working with both the old and the new signature.
        """
        if torch.cuda.is_available():
            # Peak since the previous log call; reset so every record covers
            # only its own interval.
            logs["gpu_memory_gb"] = torch.cuda.max_memory_allocated() / 1e9
            torch.cuda.reset_peak_memory_stats()
        super().log(logs, *args, **kwargs)


import inspect  # stdlib; only needed for the keyword-name check below

# Newer transformers releases take the tokenizer as `processing_class`; older
# ones only accept `tokenizer`. Pass it under the name this install declares.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Initialize trainer
trainer = MemoryMonitorTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

print("🚀 Starting QLoRA fine-tuning...")
print(f"📊 Training samples: {len(train_dataset)}")
print(f"📊 Evaluation samples: {len(eval_dataset)}")

# Start training with memory monitoring
try:
    # Clear cache before training
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    training_result = trainer.train()
    print("✅ Training completed successfully!")
    print(f"📈 Final train loss: {training_result.training_loss:.4f}")

except Exception as e:
    print(f"❌ Training failed: {e}")
    print("💡 Try reducing batch_size or max_seq_length in system_config")
    # Re-raise after printing the hint: a failed run must stop the notebook
    # here, otherwise the following cells would save and "validate" adapters
    # that were never trained, and the traceback would be lost.
    raise

In [None]:
# ===================================================================
# Cell 8: Model Saving & Adapter Management
# ===================================================================
# Persist only the LoRA adapter weights plus the tokenizer; the base model is
# not written out, which keeps the artifact small.
adapter_save_path = f"{AI_CACHE_ROOT}/qlora_adapters/qwen2.5-7b-chinese-instruct"
adapter_dir = pathlib.Path(adapter_save_path)
adapter_dir.mkdir(parents=True, exist_ok=True)

model.save_pretrained(adapter_save_path)
tokenizer.save_pretrained(adapter_save_path)

print(f"✅ LoRA adapters saved to: {adapter_save_path}")
print(f"📁 Adapter files: {list(adapter_dir.glob('*'))}")

# Total size of every regular file below the adapter directory, in MB.
adapter_bytes = 0
for entry in adapter_dir.glob("**/*"):
    if entry.is_file():
        adapter_bytes += entry.stat().st_size
adapter_size_mb = adapter_bytes / 1e6
print(f"💾 Adapter size: {adapter_size_mb:.1f} MB (vs ~13GB for full 7B model)")

In [None]:
# ===================================================================
# Cell 9: Inference Testing & Performance Comparison
# ===================================================================
def test_model_inference(prompt: str, max_new_tokens: int = 100):
    """Test the fine-tuned model with a sample prompt"""

    # Format prompt using chat template
    formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    # Tokenize input
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode and clean response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = full_response[
        len(formatted_prompt.replace("<|im_start|>assistant\n", "")) :
    ]

    return response.strip()


# Prompts used to spot-check the fine-tuned model.
test_prompts = [
    "什麼是深度學習？請用繁體中文簡單解釋。",
    "請推薦一個適合初學者的機器學習專案。",
    "解釋什麼是 Transformer 架構。",
]

print("🧪 Testing fine-tuned model:", "=" * 50, sep="\n")

# Generate one reply per prompt. A failure is reported for that prompt and the
# loop moves on to the next one.
for test_number, prompt in enumerate(test_prompts, start=1):
    print(f"\n📝 Test {test_number}: {prompt}", "🤖 Response:", sep="\n")
    try:
        response = test_model_inference(prompt, max_new_tokens=150)
        print(response)
    except Exception as e:
        print(f"❌ Error during inference: {e}")
    print("-" * 30)

In [None]:
# ===================================================================
# Cell 10: Memory Usage Analysis & Optimization Tips
# ===================================================================
def analyze_memory_usage():
    """Print a GPU memory report with tuning hints and return the key numbers.

    Returns:
        ``None`` when CUDA is unavailable. Otherwise a dict with
        ``allocated_gb``, ``total_gb`` and ``usage_percent`` (allocated memory
        as a share of the total memory of device 0; 1 GB = 1e9 bytes).
    """
    if not torch.cuda.is_available():
        print("ℹ️  CPU mode - no GPU memory analysis available")
        return None

    # Raw figures, converted to GB.
    memory_allocated = torch.cuda.memory_allocated() / 1e9
    memory_reserved = torch.cuda.memory_reserved() / 1e9
    memory_total = torch.cuda.get_device_properties(0).total_memory / 1e9
    usage_ratio = memory_allocated / memory_total
    usage_percent = usage_ratio * 100

    report_lines = (
        "📊 GPU Memory Analysis:",
        f"   Allocated: {memory_allocated:.2f} GB",
        f"   Reserved:  {memory_reserved:.2f} GB",
        f"   Total:     {memory_total:.2f} GB",
        f"   Usage:     {usage_percent:.1f}%",
    )
    for line in report_lines:
        print(line)

    # Hints are printed only above 90% or below 50% usage.
    if usage_ratio > 0.9:
        hint_lines = (
            "\n⚠️  High memory usage detected!",
            "💡 Optimization tips:",
            "   • Reduce batch_size or max_seq_length",
            "   • Enable gradient_checkpointing=True",
            "   • Use gradient_accumulation_steps to maintain effective batch size",
            "   • Consider CPU offloading for optimizer states",
        )
    elif usage_ratio < 0.5:
        hint_lines = (
            "\n✅ Memory usage is comfortable",
            "💡 You could potentially:",
            "   • Increase batch_size for faster training",
            "   • Use longer sequences (max_seq_length)",
            "   • Try a larger LoRA rank (r=32 or r=64)",
        )
    else:
        hint_lines = ()
    for line in hint_lines:
        print(line)

    return {
        "allocated_gb": memory_allocated,
        "total_gb": memory_total,
        "usage_percent": usage_percent,
    }


memory_stats = analyze_memory_usage()

# Closing checklist of general QLoRA memory-saving practices.
print("\n🔧 QLoRA Optimization Checklist:")
for checklist_item in (
    "Use 4-bit quantization (NF4)",
    "Enable gradient checkpointing",
    "Use appropriate LoRA rank (8-64)",
    "Monitor gradient accumulation steps",
    "Consider sequence length vs batch size tradeoff",
    "Use bfloat16 instead of float32",
    "Disable unnecessary logging/callbacks",
):
    print(f"✓ {checklist_item}")

In [None]:
# ===================================================================
# Cell 11: Smoke Test & Validation
# ===================================================================
def run_smoke_test():
    """Run quick end-to-end checks and print a PASS/FAIL line for each.

    Checks text generation, the presence of saved adapter weights in
    ``adapter_save_path``, GPU memory usage (only when CUDA is available) and
    the tokenizer.

    Returns:
        bool: True when every executed check passed.
    """
    tests = []  # (name, passed, status marker)

    # Test 1: Model can generate text
    try:
        test_output = test_model_inference("你好", max_new_tokens=20)
        generated = len(test_output) > 0
        tests.append(("Text generation", generated, "✅" if generated else "❌"))
    except Exception as e:
        tests.append(("Text generation", False, f"❌ Error: {e}"))

    # Test 2: Adapters were saved correctly.
    # Adapter weights may be written as safetensors or as a .bin file
    # depending on the PEFT version and save options, so accept both.
    adapter_dir = pathlib.Path(adapter_save_path)
    adapter_files = [
        weight_file
        for pattern in ("adapter_*.safetensors", "adapter_*.bin")
        for weight_file in adapter_dir.glob(pattern)
    ]
    adapters_saved = len(adapter_files) > 0
    tests.append(("Adapter saving", adapters_saved, "✅" if adapters_saved else "❌"))

    # Test 3: Memory usage is reasonable
    if torch.cuda.is_available():
        memory_ok = memory_stats["usage_percent"] < 95
        tests.append(("Memory usage", memory_ok, "✅" if memory_ok else "⚠️"))

    # Test 4: Tokenizer works correctly
    try:
        tokens = tokenizer("測試中文tokenization", return_tensors="pt")
        tokenizer_ok = tokens["input_ids"].numel() > 0
        # The marker follows the result instead of always showing a check mark.
        tests.append(("Tokenizer", tokenizer_ok, "✅" if tokenizer_ok else "❌"))
    except Exception:
        tests.append(("Tokenizer", False, "❌"))

    print("🧪 Smoke Test Results:")
    print("=" * 40)
    for test_name, passed, status in tests:
        print(f"{status} {test_name}: {'PASS' if passed else 'FAIL'}")

    all_passed = all(passed for _, passed, _ in tests)
    summary = (
        "✅ All tests passed!"
        if all_passed
        else "⚠️  Some tests failed - check configuration"
    )
    print(f"\n{summary}")

    return all_passed


# Run the smoke test once and keep its overall pass/fail result.
smoke_test_passed = run_smoke_test()

In [None]:
# ===================================================================
# Final Smoke Test & Acceptance Criteria (5-line validation)
# ===================================================================

# Test: QLoRA fine-tuning pipeline completion
final_adapter_dir = pathlib.Path(
    f"{AI_CACHE_ROOT}/qlora_adapters/qwen2.5-7b-chinese-instruct"
)
assert final_adapter_dir.exists(), "❌ Adapters not saved"

# Adapter weights may be stored as safetensors or as a .bin file depending on
# the PEFT version and save options, so look for both.
final_adapter_files = [
    weight_file
    for pattern in ("adapter_*.safetensors", "adapter_*.bin")
    for weight_file in final_adapter_dir.glob(pattern)
]
assert len(final_adapter_files) > 0, "❌ No adapter files found"

test_response = test_model_inference("什麼是AI？", max_new_tokens=50)
assert len(test_response) > 10, f"❌ Model output too short: {test_response}"
print("✅ QLoRA fine-tuning pipeline validated successfully!")

# Total size of all regular files below the adapter directory, in MB.
final_adapter_bytes = sum(
    entry.stat().st_size for entry in final_adapter_dir.glob("**/*") if entry.is_file()
)
print(f"📊 Adapter size: {final_adapter_bytes / 1e6:.1f} MB")
print(f"🧪 Sample output: {test_response[:100]}...")


## **6. 本章小結**

### **✅ 完成項目**
- **QLoRA 4-bit 量化微調流程** - 成功在 8GB VRAM 環境下微調 7B 模型
- **記憶體優化策略整合** - 梯度檢查點、動態批次大小、CPU offloading
- **中文指令跟隨資料集** - 建立並格式化繁體中文訓練資料
- **適配器管理系統** - LoRA 權重的儲存、載入與版本控制
- **效能評估與比較** - 微調前後的生成品質與記憶體使用分析

### **🔬 核心原理要點**
- **4-bit NF4 量化 (bitsandbytes)** - 將模型權重量化為 4-bit，大幅降低記憶體需求
- **LoRA 低秩適應 (PEFT)** - 只訓練少量適配器參數，保持原模型凍結
- **梯度檢查點技術** - 以計算時間換取記憶體空間，突破 VRAM 限制
- **記憶體分層管理** - GPU/CPU 混合運算，自動 offloading 優化
- **動態批次調整** - 根據硬體能力自適應訓練配置

### **⚠️ 常見坑與解決方案**
- **OOM 錯誤** → 降低 `per_device_train_batch_size`，增加 `gradient_accumulation_steps`
- **量化精度損失** → 使用 `bnb_4bit_quant_type="nf4"` 並搭配 bfloat16 計算型別以降低精度損失；`bnb_4bit_use_double_quant=True` 的作用是進一步節省記憶體，而非提升精度
- **訓練不收斂** → 調整學習率（2e-4 至 5e-4），檢查 LoRA rank 設定
- **推理速度慢** → 合併適配器權重，或使用專門的推理引擎

### **🚀 下一步建議**
1. **進階微調技術** → 探索 DPO (Direct Preference Optimization) 對齊方法
2. **領域特化微調** → 針對醫療、法律、金融等特定領域進行 QLoRA 微調
3. **多模態擴展** → 結合視覺-語言模型進行多模態 QLoRA 微調
4. **評估體系完善** → 建立更全面的中文任務評估基準
5. **生產部署優化** → 整合 vLLM、TensorRT-LLM 等推理加速框架

---

## **🎯 階段性總結與下一步選項比較**

### **已完成核心技能棧**
✅ **RAG 基礎檢索問答** (E1) - FAISS 向量檢索 + PDF 文件處理  
✅ **Function Calling 工具使用** (C4) - LangChain 工具整合與函數調用  
✅ **QLoRA 低資源微調** (D2) - 4-bit 量化微調大型語言模型

### **下一階段優先選項分析**

**🔥 選項 A: 多代理協作系統 (E4 - Multi-Agent Collaboration)**
```
✅ 優勢: 
- 建構完整的 AI 工作流程 (Research → Plan → Write → Review)
- 結合已學的 RAG + Function Calling 技能
- 實用性極高，可直接應用於內容創作、研究報告等場景

⚠️ 挑戰:
- 需要設計代理間的通訊協議與任務分配邏輯
- 計算資源需求較高 (多個模型實例同時運行)
- 複雜度高，除錯與優化較困難

📊 技能收穫: 系統架構設計、工作流程編排、代理通訊協議
🎯 應用場景: 自動化內容生產、研究助手、決策支援系統
```

**🔥 選項 B: DPO 偏好對齊微調 (D5 - DPO vs RLHF)**
```
✅ 優勢:
- 深化微調技能，學習最新的對齊技術
- 相對 RLHF 更簡單，計算需求較低
- 可直接基於已完成的 QLoRA 基礎進行擴展

⚠️ 挑戰:
- 需要準備高品質的偏好資料集
- 對齊評估較為主觀，需要人工標註
- 理論概念較為複雜 (偏好學習、Bradley-Terry 模型)

📊 技能收穫: 偏好學習、對齊技術、人類反饋整合
🎯 應用場景: 安全 AI 系統、客戶服務機器人、內容審核
```

**🔥 選項 C: 多模態 RAG 系統 (E2 - Multimodal RAG with CLIP)**
```
✅ 優勢:
- 擴展 RAG 能力至圖像+文本檢索
- 學習 CLIP/BLIP 等視覺-語言模型
- 應用場景豐富 (電商搜尋、文檔分析、多媒體問答)

⚠️ 挑戰:
- 需要處理更複雜的資料類型與向量空間
- 模型複雜度增加，調試困難
- 評估指標設計更加複雜

📊 技能收穫: 多模態模型應用、視覺特徵提取、跨模態檢索
🎯 應用場景: 智能客服、商品搜尋、醫療影像問答
```

**🔥 選項 D: Gradio WebUI 整合 (F1 - Production-Ready Interface)**
```
✅ 優勢:
- 將所有技能整合成可用的產品介面
- 學習前端整合與使用者體驗設計
- 可立即展示學習成果，成就感強

⚠️ 挑戰:
- 前端技能需求 (雖然 Gradio 簡化了很多)
- 需要考慮並發、安全性等生產環境問題
- 整合複雜度高，可能遇到各種相容性問題

📊 技能收穫: 全棧開發、UI/UX 設計、系統整合
🎯 應用場景: 企業內部工具、產品原型、技術展示
```

---

### **💡 我的建議優先序**

**🥇 首選：選項 A - 多代理協作系統 (E4)**
- **理由**：能夠有機整合前面學到的所有技能 (RAG + Tools + Fine-tuning)
- **學習價值**：系統性思維、架構設計、複雜問題分解
- **實用性**：可直接用於自動化研究、內容創作等實際場景
- **技能進階**：從單點技術走向系統工程思維

**🥈 次選：選項 C - 多模態 RAG (E2)**  
- **理由**：在 RAG 基礎上自然延伸，技術挑戰適中
- **學習價值**：多模態 AI 是未來趨勢，值得投資
- **差異化**：相對少見的技能，具有競爭優勢

**🥉 第三：選項 D - WebUI 整合 (F1)**
- **理由**：整合展示，驗證所有技能的可用性
- **時機考量**：建議在完成更多核心技能後再進行

**🎯 您偏好哪個選項？**

請告訴我您想要優先學習哪個方向，我將立即準備對應的詳細 notebook 內容！