In [None]:
# nb20_lora_peft_tuning.ipynb
# LoRA (Low-Rank Adaptation) 微調實戰

# Cell 1: Environment Setup and Shared Cache
import os, pathlib, torch

# Root directory for every shared cache used by this notebook.
# Override by exporting AI_CACHE_ROOT before starting the kernel.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")
# Export each cache location as an environment variable and create the directory.
# These are set before transformers/datasets are imported in later cells —
# presumably so those libraries pick the paths up at import time.
# NOTE(review): TRANSFORMERS_CACHE is reported as deprecated in favour of HF_HOME
# by recent transformers releases — confirm; later cells still read it directly.
for k, v in {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}.items():
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

# Report the cache root and, when CUDA is available, the first GPU's name and total memory.
print(f"[Cache] Root: {AI_CACHE_ROOT}")
print(f"[GPU] Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"[GPU] Device: {torch.cuda.get_device_name(0)}")
    print(
        f"[GPU] Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB"
    )

# Install required packages
# NOTE(review): this notebook loads a Qwen2.5 checkpoint; the Qwen2.5 model cards
# state that transformers>=4.37.0 is required (older releases fail with
# KeyError: 'qwen2'), so the previous `transformers==4.36.0` pin was raised — confirm
# against your environment.
# %pip install "transformers>=4.37.0" peft==0.7.1 datasets==2.14.0 accelerate bitsandbytes

In [None]:
# Cell 2: LoRA Theory and Configuration
"""
LoRA (Low-Rank Adaptation) 原理：
- 原始權重矩陣 W ∈ R^(d×k) 保持凍結
- 添加低秩分解：ΔW = BA，其中 B ∈ R^(d×r), A ∈ R^(r×k)
- r << min(d,k)，大幅減少可訓練參數
- 前向傳播：h = (W + α/r * BA)x

關鍵參數：
- r (rank): 低秩維度，通常 4-64，越大表達能力越強但參數越多
- α (alpha): 縮放因子，控制 LoRA 貢獻度
- target_modules: 要應用 LoRA 的模組（通常是 attention 的 q,k,v,o）
"""

from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
import torch
import json

# LoRA configuration: rank-16 adapters on the attention AND MLP projections listed below
lora_config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=32,  # scaling factor; 2*r here, so alpha/r in the formula above equals 2
    target_modules=[  # names of the modules that receive LoRA adapters
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",  # the four entries above are the attention projections
        "gate_proj",
        "up_proj",
        "down_proj",  # MLP layers for better coverage
    ],
    lora_dropout=0.1,  # dropout for regularization
    bias="none",  # don't train bias terms
    task_type=TaskType.CAUSAL_LM,
)

# Echo the effective settings so they are recorded in the notebook output.
print("LoRA Config:")
print(f"  Rank (r): {lora_config.r}")
print(f"  Alpha: {lora_config.lora_alpha}")
print(f"  Target modules: {lora_config.target_modules}")
print(f"  Dropout: {lora_config.lora_dropout}")

In [None]:
# Cell 3: Base Model Loading with Low-VRAM Settings
# Imported in this cell (not Cell 1) so that transformers is only imported after
# the cache environment variables have been exported.
from transformers import BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-7B-Instruct"  # Good Chinese performance

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True, cache_dir=os.environ["TRANSFORMERS_CACHE"]
)

# Add padding token if missing (fall back to EOS so batches can be padded)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Explicit 4-bit quantization settings. Passing a bare `load_in_4bit=True` builds a
# default config whose compute dtype is float32, which is slow with fp16 activations
# and is a deprecated way of requesting quantization; state the compute dtype instead.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model with memory optimization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # automatic device placement
    torch_dtype=torch.float16,  # half precision for the non-quantized modules
    quantization_config=quantization_config,  # 4-bit weights for VRAM efficiency
    trust_remote_code=True,
    cache_dir=os.environ["TRANSFORMERS_CACHE"],
)

print(f"Model loaded: {model_name}")
print(f"Vocab size: {len(tokenizer)}")
print(f"Model device: {next(model.parameters()).device}")

In [None]:
# Cell 4: Chinese Instruction Dataset Preparation
"""
創建中文指令數據集 - 包含日常對話、知識問答、任務指令等
格式：{"instruction": "問題", "output": "回答"}
"""

# Sample Chinese instruction data - mix of conversation, QA, and tasks.
# Each record is a dict with two string keys: "instruction" (the user turn) and
# "output" (the assistant turn); format_instruction() below reads exactly these keys.
chinese_instructions = [
    # Daily conversation
    {
        "instruction": "你好，你是誰？",
        "output": "你好！我是一個AI助手，很高興為您服務。我可以回答問題、協助任務，或與您聊天。有什麼我可以幫助您的嗎？",
    },
    {
        "instruction": "今天天氣如何？",
        "output": "很抱歉，我無法獲取即時天氣資訊。建議您查看天氣應用程式或氣象網站來了解當前天氣狀況。",
    },
    {
        "instruction": "推薦一些好看的電影",
        "output": "我推薦幾部不同類型的優秀電影：《肖申克的救贖》（劇情）、《星際效應》（科幻）、《千與千尋》（動畫）、《寄生蟲》（驚悚）。您偏好哪種類型呢？",
    },
    # Knowledge QA
    {
        "instruction": "什麼是人工智慧？",
        "output": "人工智慧（AI）是指讓機器模擬人類智能的技術，包括學習、推理、感知、決策等能力。主要分支有機器學習、深度學習、自然語言處理、電腦視覺等。",
    },
    {
        "instruction": "台灣的首都是哪裡？",
        "output": "台灣的首都是台北市。台北是台灣的政治、經濟、文化中心，也是最大的都市。",
    },
    {
        "instruction": "解釋什麼是機器學習",
        "output": "機器學習是人工智慧的一個分支，讓電腦通過數據自動學習和改進，而不需要明確編程。主要類型包括監督學習、無監督學習和強化學習。",
    },
    # Task instructions
    {
        "instruction": "幫我寫一封感謝信",
        "output": "親愛的[收件人姓名]：\n\n感謝您在[具體事件]中給予的幫助和支持。您的[具體幫助內容]對我來說意義重大，讓我能夠[達成的結果]。\n\n再次表達我的誠摯謝意，期待未來有機會回報您的恩情。\n\n此致\n敬禮\n\n[您的姓名]\n[日期]",
    },
    {
        "instruction": "總結以下文字的重點",
        "output": "很抱歉，您沒有提供需要總結的文字內容。請提供具體的文字材料，我將為您提取重點和關鍵資訊。",
    },
    {"instruction": "翻譯：Hello, how are you?", "output": "你好，你好嗎？"},
    {
        "instruction": "用Python寫一個計算階乘的函數",
        "output": "```python\ndef factorial(n):\n    if n < 0:\n        return None  # 負數沒有階乘\n    elif n == 0 or n == 1:\n        return 1\n    else:\n        return n * factorial(n - 1)\n\n# 使用範例\nprint(factorial(5))  # 輸出: 120\n```",
    },
]

# Extend dataset with more samples (you can add more or load from file).
# List repetition: the 10 records above become 50 entries that reference the same
# 10 dict objects. This adds optimizer steps per epoch, not new information.
extended_data = chinese_instructions * 5  # Repeat for more training data


# Format data for training (Chat template format)
def format_instruction(example):
    """Render one instruction/output record as a single chat-formatted string.

    Args:
        example: Mapping with "instruction" and "output" keys.

    Returns:
        A dict ``{"text": ...}`` holding whatever the tokenizer's chat template
        returns for a two-turn (user, assistant) conversation.
    """
    user_turn = {"role": "user", "content": example["instruction"]}
    assistant_turn = {"role": "assistant", "content": example["output"]}

    # tokenize=False asks for text rather than token ids; no generation prompt is
    # requested because the assistant turn is already part of the conversation.
    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
    return dict(text=rendered)


# Convert to formatted dataset
from datasets import Dataset

# Render every raw record through the chat template, then wrap the result in a Dataset.
formatted_data = list(map(format_instruction, extended_data))
train_dataset = Dataset.from_list(formatted_data)

# Show the dataset size and the first 300 characters of the first rendered sample.
print(f"Training dataset size: {len(train_dataset)}")
print("\nSample formatted text:")
print(f'{train_dataset[0]["text"][:300]}...')

In [None]:
# Cell 5: LoRA Model Setup and Wrapping
"""
使用 PEFT 將 LoRA 適配器包裝到基礎模型上
只有 LoRA 參數會被訓練，基礎模型保持凍結
"""

# Prepare the 4-bit base model for training BEFORE attaching adapters.
# prepare_model_for_kbit_training freezes the base weights, upcasts the remaining
# non-quantized parameters for numerical stability, and (with
# use_gradient_checkpointing=True) enables gradient checkpointing together with
# input gradients. Without it, the gradient_checkpointing=True setting used by the
# Trainer later leaves no tensor requiring grad and backward() fails.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Wrap model with LoRA (only the adapter weights are trainable afterwards)
peft_model = get_peft_model(model, lora_config)


# Print trainable parameters info
def print_trainable_parameters(model):
    """Print how many parameter elements of ``model`` are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` whose parameters have
            ``numel()`` and ``requires_grad``.

    Prints two lines: the trainable and total element counts, and the trainable
    share as a percentage with two decimals. Returns nothing.
    """
    # Collect (element count, trainable flag) once, then aggregate.
    sizes = [
        (param.numel(), param.requires_grad) for _, param in model.named_parameters()
    ]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)

    print(f"Trainable params: {trainable_params:,} || All params: {all_param:,}")
    print(f"Trainable%: {100 * trainable_params / all_param:.2f}%")


print_trainable_parameters(peft_model)

# Verify LoRA modules are added.
# Every targeted projection in every layer contributes several "lora_*" submodules,
# so printing all of them floods the cell output. Report the total and a short
# preview instead.
LORA_PREVIEW_LIMIT = 10
lora_module_names = [
    name for name, _ in peft_model.named_modules() if "lora" in name.lower()
]

print("\nLoRA modules added:")
print(f"  total: {len(lora_module_names)}")
for name in lora_module_names[:LORA_PREVIEW_LIMIT]:
    print(f"  {name}")
if len(lora_module_names) > LORA_PREVIEW_LIMIT:
    print(f"  ... and {len(lora_module_names) - LORA_PREVIEW_LIMIT} more")

In [None]:
# Cell 6: Training Configuration with Memory Optimization
"""
訓練參數設定，針對低顯存環境優化
"""


# Tokenization function for training
def tokenize_function(examples):
    """Tokenize a batch of chat-formatted texts for causal-LM training.

    Args:
        examples: Batch mapping with a "text" entry (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with an added "labels" entry that is a shallow copy
        of "input_ids".
    """
    encoding_options = dict(
        truncation=True,
        padding=False,  # padding is left to the data collator
        max_length=512,  # cap sequence length for efficiency
        return_tensors=None,
    )
    tokenized = tokenizer(examples["text"], **encoding_options)

    # Causal LM: targets are the inputs themselves.
    # NOTE(review): the collator configured later in this notebook appears to rebuild
    # labels from input_ids itself — confirm whether this copy is still needed.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


# Apply tokenization (drops the raw "text" column so only model inputs remain)
tokenized_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=train_dataset.column_names
)

# Data collator for language modeling: pads each batch dynamically
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM, not masked LM
    pad_to_multiple_of=8,  # For efficiency
)

# Training arguments with memory optimization
training_args = TrainingArguments(
    output_dir=f"{AI_CACHE_ROOT}/lora_checkpoints",
    # Training schedule
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Small batch size for memory
    gradient_accumulation_steps=8,  # Effective batch size = 1*8 = 8
    # Optimization
    learning_rate=2e-4,  # Higher LR often works well for LoRA
    weight_decay=0.01,
    # Warm up over a fraction of the run rather than a fixed step count: with the
    # 50-sample dataset and an effective batch of 8 the whole run is only ~18
    # optimizer steps, so the previous warmup_steps=100 never let the learning rate
    # reach its target value.
    warmup_ratio=0.1,
    # Memory optimization
    dataloader_pin_memory=False,
    gradient_checkpointing=True,  # Trade compute for memory
    fp16=True,  # Half precision training
    # Logging and saving
    logging_steps=10,
    save_steps=50,
    save_total_limit=3,  # Keep only recent checkpoints
    # Misc
    remove_unused_columns=False,
    # The string "none" disables wandb/tensorboard; Python None means "use the
    # default", which is every installed integration.
    report_to="none",
    seed=42,
)

print("Training Configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Output dir: {training_args.output_dir}")

In [None]:
# Cell 7: Training Loop Execution
"""
執行 LoRA 微調訓練
監控損失下降和記憶體使用
"""

# Initialize trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Check memory before training
if torch.cuda.is_available():
    print(f"GPU memory before training: {torch.cuda.memory_allocated() / 1e9:.2f}GB")

print("\n🚀 Starting LoRA fine-tuning...")
print("This may take 10-30 minutes depending on your hardware.")

# Start training.
# A failure is reported with a hint and then re-raised: the following cells save
# and evaluate the adapter, so continuing after a failed run would silently
# produce an untrained adapter and leave `training_result` undefined.
try:
    training_result = trainer.train()
except Exception as e:
    print(f"\n❌ Training failed: {e}")
    print("Try reducing batch size or sequence length if OOM occurs.")
    raise
else:
    print("\n✅ Training completed successfully!")
    print(f"Final train loss: {training_result.training_loss:.4f}")
finally:
    # Check memory after training (reported on success and on failure alike)
    if torch.cuda.is_available():
        print(
            f"GPU memory after training: {torch.cuda.memory_allocated() / 1e9:.2f}GB"
        )

In [None]:
# Cell 8: Model Saving and Loading Verification
"""
保存 LoRA 適配器並驗證載入
"""

# Save LoRA adapter (only saves the small adapter weights)
lora_save_path = f"{AI_CACHE_ROOT}/lora_adapters/qwen2.5-7b-chinese-instruct"
peft_model.save_pretrained(lora_save_path)

print(f"✅ LoRA adapter saved to: {lora_save_path}")

# Check saved files
import os

saved_files = os.listdir(lora_save_path)
print(f"Saved files: {saved_files}")

# Calculate adapter size in MB. Only regular files are summed, so a stray
# sub-directory in the save path does not distort the figure.
total_size = sum(
    os.path.getsize(os.path.join(lora_save_path, f))
    for f in saved_files
    if os.path.isfile(os.path.join(lora_save_path, f))
) / (1024 * 1024)
print(f"Adapter size: {total_size:.2f} MB")

# Test loading the adapter
print("\n🔄 Testing adapter loading...")
from peft import PeftModel
from transformers import BitsAndBytesConfig

# Load base model again (simulate fresh start).
# The 4-bit settings are spelled out with BitsAndBytesConfig: a bare
# `load_in_4bit=True` builds a default config with float32 compute dtype and is a
# deprecated way of requesting quantization.
# Note: the training model is still resident, so this needs room for a second copy.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    trust_remote_code=True,
    cache_dir=os.environ["TRANSFORMERS_CACHE"],
)

# Load and apply LoRA adapter
loaded_model = PeftModel.from_pretrained(base_model, lora_save_path)
print("✅ LoRA adapter loaded successfully!")

In [None]:
# Cell 9: Before/After Comparison
"""
比較微調前後的中文回應品質
"""


def generate_response(model, prompt, max_length=100):
    """Generate a sampled reply to a single user prompt.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        prompt: User message text.
        max_length: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed),
        with surrounding whitespace stripped.
    """
    # Wrap the prompt as a one-turn chat and ask for the assistant prefix.
    chat_text = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    sampling_options = dict(
        max_new_tokens=max_length,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
    )
    with torch.no_grad():
        outputs = model.generate(**inputs, **sampling_options)

    # Everything past the prompt length is the model's continuation.
    new_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# Test prompts
test_prompts = [
    "你好，請自我介紹一下",
    "什麼是機器學習？",
    "幫我寫一首關於春天的詩",
    "Python 和 Java 有什麼差別？",
]

print("📊 微調前後效果比較\n")

# Compare base model vs fine-tuned model.
# `model` cannot serve as the baseline: get_peft_model() injected the LoRA layers
# into it in place, so after training it IS the fine-tuned model. The baseline is
# obtained from `loaded_model` with its adapter temporarily disabled.
for i, prompt in enumerate(test_prompts, 1):
    print(f"測試 {i}: {prompt}")
    print("-" * 50)

    # Original (adapter-free) model response
    try:
        with loaded_model.disable_adapter():
            original_response = generate_response(loaded_model, prompt, max_length=80)
        print(f"原始模型: {original_response}")
    except Exception as e:
        print(f"原始模型: [Error] {e}")

    # Fine-tuned model response
    try:
        finetuned_response = generate_response(loaded_model, prompt, max_length=80)
        print(f"微調模型: {finetuned_response}")
    except Exception as e:
        print(f"微調模型: [Error] {e}")

    print("\n")

In [None]:
# Cell 10: Smoke Test and Summary
"""
驗收測試：確保所有組件正常工作
"""


def smoke_test():
    """Run four independent sanity checks on the fine-tuning artefacts.

    Each check either yields a "✅ ..." line or, if anything inside it raises,
    a "❌ <label> failed: <error>" line. A failing check never stops the others.

    Returns:
        List of four result strings, in the order: config, LoRA parameters,
        adapter files, text generation.
    """

    def check_config():
        # The rank must be positive and at least one module must be targeted.
        assert lora_config.r > 0, "LoRA rank should be positive"
        assert len(lora_config.target_modules) > 0, "Should have target modules"
        return "✅ LoRA config valid"

    def check_lora_parameters():
        # Count parameter tensors whose name mentions "lora".
        lora_params = len(
            [name for name, _ in peft_model.named_parameters() if "lora" in name]
        )
        assert lora_params > 0, "Should have LoRA parameters"
        return f"✅ LoRA parameters added ({lora_params} tensors)"

    def check_adapter_files():
        # Only verifies that the save directory and its config file exist.
        assert os.path.exists(lora_save_path), "LoRA adapter should be saved"
        assert os.path.exists(
            f"{lora_save_path}/adapter_config.json"
        ), "Config should exist"
        return "✅ Adapter save/load works"

    def check_generation():
        test_response = generate_response(loaded_model, "你好", max_length=20)
        assert len(test_response) > 0, "Should generate non-empty response"
        return "✅ Text generation works"

    checks = (
        ("LoRA config", check_config),
        ("LoRA parameters", check_lora_parameters),
        ("Adapter save/load", check_adapter_files),
        ("Text generation", check_generation),
    )

    tests = []
    for label, check in checks:
        try:
            tests.append(check())
        except Exception as e:
            tests.append(f"❌ {label} failed: {e}")
    return tests


# Run smoke test and echo each result line
print("🧪 執行驗收測試...")
test_results = smoke_test()
for result in test_results:
    print(result)

# A result counts as passed when its line starts with the check-mark emoji.
passed_count = len([r for r in test_results if r.startswith("✅")])
print(f"\n✅ 通過測試: {passed_count}/{len(test_results)}")

# Summary statistics
trainable_param_count = sum(
    p.numel() for p in peft_model.parameters() if p.requires_grad
)
total_param_count = sum(p.numel() for p in peft_model.parameters())

print("\n📈 訓練總結:")
print(f"基礎模型: {model_name}")
print(f"LoRA rank: {lora_config.r}")
print(f"訓練樣本: {len(train_dataset)}")
print(f"適配器大小: {total_size:.2f} MB")
print(f"可訓練參數比例: {100 * trainable_param_count / total_param_count:.2f}%")

# Key-concept recap, printed line by line
print("\n🎯 重要概念回顧:")
for recap_line in (
    "• LoRA 通過低秩分解大幅減少可訓練參數",
    "• 只需保存小型適配器文件，基礎模型可重用",
    "• rank 參數控制表達能力與效率的平衡",
    "• 適合在有限資源下進行模型個性化",
):
    print(recap_line)

# Suggested next steps, printed line by line
print("\n🚀 下一步建議:")
for next_step_line in (
    "• 嘗試不同的 rank 值 (4, 8, 32, 64)",
    "• 測試更多 target_modules 組合",
    "• 收集更多高品質的中文指令數據",
    "• 進行量化評估 (ROUGE, BLEU, 人工評分)",
    "• 學習 QLoRA (nb21) 實現更低顯存微調",
):
    print(next_step_line)

In [None]:
# Quick smoke test for LoRA fine-tuning completion
def quick_lora_test():
    """Minimal end-to-end check that fine-tuning artefacts exist and are usable.

    Verifies that ``lora_config`` is defined at notebook level, that the adapter
    save path exists, and that the reloaded model produces non-empty text.

    Raises:
        AssertionError: If any of the three conditions does not hold.
    """
    # `lora_config` is a notebook-level (global) name. locals() inside a function
    # only holds the function's own variables, so checking locals() always failed.
    assert "lora_config" in globals(), "LoRA config should be defined"
    assert os.path.exists(lora_save_path), "LoRA adapter should be saved"
    response = generate_response(loaded_model, "你好", max_length=10)
    assert len(response) > 0, "Model should generate text"
    print("✅ LoRA fine-tuning smoke test passed!")


quick_lora_test()


## 6. 本章小結

### ✅ 完成項目
- **LoRA 原理與實作**：低秩適應器配置與應用
- **PEFT 庫整合**：Hugging Face PEFT 無縫整合
- **中文指令微調**：繁體中文對話與任務數據
- **記憶體優化**：4-bit 量化 + 梯度檢查點
- **效果評估**：微調前後中文回應品質比較

### 🔧 核心原理要點
- **低秩分解**：ΔW = BA，r << min(d,k) 大幅減少參數
- **參數效率**：僅 0.1-1% 參數可訓練，適配器檔案小
- **模組化設計**：適配器可獨立保存、載入、分享
- **記憶體友善**：梯度檢查點 + 量化實現低 VRAM 微調

### ⚠️ 常見坑點
- **rank 選擇**：太小表達能力不足，太大接近全量微調
- **目標模組**：需根據模型架構調整 target_modules
- **數據格式**：確保使用正確的聊天模板格式
- **顯存管理**：訓練時監控記憶體，適時調整批次大小

### 🚀 下一步建議
1. **進階技術**：學習 QLoRA (nb21) 實現 int4 量化微調
2. **數據擴展**：收集更多高品質中文指令-回應對
3. **超參調優**：系統性測試不同 rank、alpha、學習率組合
4. **評估完善**：建立自動化評估管線 (ROUGE, 人工評分)
5. **應用整合**：將微調後模型整合到 RAG/Agent 系統

---

**記憶體需求評估**：
- 4GB VRAM：可微調 7B 模型 (batch_size=1)
- 8GB VRAM：可用更大批次或更高 rank
- 12GB+ VRAM：可嘗試 14B 模型微調

這個 notebook 提供了完整的 LoRA 微調流程，從理論到實作，特別針對中文場景和低顯存環境優化。後續可繼續進行 QLoRA (nb21)，或先完善當前的微調評估。