In [None]:
# nb11_instruction_tuning_demo.ipynb
# 指令調優示範 - Instruction Tuning with LoRA/QLoRA

# === Cell 1: Shared Cache Bootstrap & Environment Setup ===
import os
import pathlib
import warnings

import torch

# Hide UserWarning noise so the notebook output stays readable.
warnings.filterwarnings("ignore", category=UserWarning)

# Root directory of the shared cache; override it with the AI_CACHE_ROOT env var.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")

# Environment variable name -> directory it is pointed at.
paths = dict(
    HF_HOME=f"{AI_CACHE_ROOT}/hf",
    TRANSFORMERS_CACHE=f"{AI_CACHE_ROOT}/hf/transformers",
    HF_DATASETS_CACHE=f"{AI_CACHE_ROOT}/hf/datasets",
    HUGGINGFACE_HUB_CACHE=f"{AI_CACHE_ROOT}/hf/hub",
    TORCH_HOME=f"{AI_CACHE_ROOT}/torch",
)

# Export every variable and make sure its directory exists.
for env_name, cache_dir in paths.items():
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

# Report the cache location and the GPU that will be used, if any.
print(f"[Cache] Root: {AI_CACHE_ROOT}")
print(f"[GPU] Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"[GPU] Device: {torch.cuda.get_device_name(0)}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"[GPU] Memory: {total_gb:.1f}GB")

In [None]:
# Check required packages
# The import order is kept as-is so the first missing package reported is the
# same one the original cell would have reported.
try:
    import transformers
    import datasets
    import peft
    import bitsandbytes
    from transformers import AutoTokenizer
    from transformers import AutoModelForCausalLM
    from transformers import TrainingArguments
    from transformers import Trainer
    from peft import LoraConfig, get_peft_model, TaskType, PeftModel
    from datasets import load_dataset
    import json
    import random
except ImportError as missing:
    # Tell the reader what is missing and how to install it.
    print(f"❌ Missing package: {missing}")
    print(
        "Install with: pip install transformers datasets peft bitsandbytes accelerate"
    )
else:
    print("✅ All required packages imported successfully")

In [None]:
# === Cell 2: Load and Explore Instruction Dataset ===
print("🔍 Loading instruction dataset...")

# Try public instruction datasets first. If neither can be loaded, build a tiny
# synthetic dataset so the remaining cells still run. Each failure is reported
# (instead of silently swallowed by a bare `except:`) so the reader can tell a
# network problem from a typo.
try:
    # Option 1: Stanford Alpaca (English)
    dataset = load_dataset("tatsu-lab/alpaca", split="train")
    print(f"✅ Loaded Stanford Alpaca dataset: {len(dataset)} examples")
except Exception as alpaca_error:
    print(f"⚠️ Could not load Stanford Alpaca: {alpaca_error}")
    try:
        # Option 2: Databricks Dolly (alternative)
        dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
        # The rest of the notebook expects Alpaca column names
        # (instruction / input / output). Rename Dolly-style columns when they
        # are present so downstream cells do not fail with KeyError.
        rename_map = {
            old: new
            for old, new in (("context", "input"), ("response", "output"))
            if old in dataset.column_names and new not in dataset.column_names
        }
        if rename_map:
            dataset = dataset.rename_columns(rename_map)
        print(f"✅ Loaded Databricks Dolly dataset: {len(dataset)} examples")
    except Exception as dolly_error:
        # Option 3: Chinese instruction dataset (fallback)
        print(f"⚠️ Could not load Databricks Dolly: {dolly_error}")
        print("⚠️ Using synthetic examples (original datasets unavailable)")
        synthetic_data = [
            {
                "instruction": "請解釋什麼是機器學習",
                "input": "",
                "output": "機器學習是人工智慧的一個分支，它使計算機系統能夠通過經驗自動學習和改進，而無需被明確編程。",
            },
            {
                "instruction": "將以下句子翻譯成英文",
                "input": "今天天氣很好",
                "output": "The weather is very nice today.",
            },
            {
                "instruction": "列出三個程式設計的最佳實踐",
                "input": "",
                "output": "1. 寫清晰的註解和文檔\n2. 使用有意義的變數和函數名稱\n3. 保持代碼簡潔和模組化",
            },
        ]
        from datasets import Dataset

        dataset = Dataset.from_list(synthetic_data * 100)  # Repeat for demo
        print(f"✅ Created synthetic dataset: {len(dataset)} examples")

# Explore dataset structure
print("\n📊 Dataset Structure:")
print(f"Columns: {dataset.column_names}")
print(f"First example:")
example = dataset[0]
for key, value in example.items():
    # Truncate on the string form so non-string values (ints, lists, None)
    # cannot raise TypeError when they are long.
    text = str(value)
    preview = text[:100] + "..." if len(text) > 100 else value
    print(f"  {key}: {preview!r}")

# Basic statistics: count rows whose "input" field is non-blank. Reading the
# column directly avoids materialising every full row, and `or ""` tolerates
# None values.
if "input" in dataset.column_names:
    instructions_with_input = sum(
        1 for text in dataset["input"] if (text or "").strip()
    )
else:
    instructions_with_input = 0
print(f"\n📈 Dataset Statistics:")
print(f"Total examples: {len(dataset)}")
print(f"Examples with input: {instructions_with_input}")
print(f"Examples without input: {len(dataset) - instructions_with_input}")

In [None]:
# === Cell 3: Data Preprocessing and Formatting ===
print("🔧 Preprocessing instruction data...")


def format_instruction(example):
    """
    Format one instruction record into an Alpaca-style training text.

    Args:
        example: Mapping with an "instruction" field, an optional "input"
            field and an "output" field. Dolly-style records are also
            accepted: "context" is used when "input" is missing or empty, and
            "response" is used when "output" is missing. A value of None for
            the optional input is treated as empty.

    Returns:
        dict with:
            "prompt":    the template text up to and including "### Response:\\n"
            "response":  the stripped target text
            "full_text": prompt + response (what the model is trained on)
            "length":    number of characters in full_text (not tokens)

    Raises:
        KeyError: if "instruction" is missing, or neither "output" nor
            "response" is present.
    """
    instruction = example["instruction"].strip()

    # Optional context: prefer "input", fall back to Dolly's "context".
    input_text = (example.get("input") or example.get("context") or "").strip()

    # Target text: prefer "output", fall back to Dolly's "response".
    output = example.get("output")
    if output is None:
        output = example.get("response")
    if output is None:
        raise KeyError("output")
    output = output.strip()

    if input_text:
        # Instruction with input
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
    else:
        # Instruction only
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

    # Full text for training (prompt + response)
    full_text = prompt + output

    return {
        "prompt": prompt,
        "response": output,
        "full_text": full_text,
        "length": len(full_text),
    }


# Run the formatter over every record.
formatted_dataset = dataset.map(format_instruction)

# Drop records whose formatted text is too long (length is in characters).
MAX_LENGTH = 512  # Adjust based on your GPU memory


def _within_length_limit(row):
    """Keep rows whose character length does not exceed MAX_LENGTH."""
    return row["length"] <= MAX_LENGTH


filtered_dataset = formatted_dataset.filter(_within_length_limit)

n_before, n_after = len(formatted_dataset), len(filtered_dataset)
print(
    f"✅ Formatted dataset: {n_before} → {n_after} examples (after length filtering)"
)

# Preview the first surviving record.
print("\n📝 Formatted Example:")
example = filtered_dataset[0]
for heading, field in (("Prompt:", "prompt"), ("Response:", "response")):
    print(heading)
    print(example[field])
print(f"Total length: {example['length']} chars")

In [None]:
# === Cell 4: Load Base Model and Baseline Test ===
print("🤖 Loading base model for instruction tuning...")

# Model selection based on available VRAM
MODEL_NAME = "microsoft/DialoGPT-small"  # Lightweight for demo
# Alternative: "Qwen/Qwen2.5-0.5B-Instruct" or "google/flan-t5-small"

# Configure 4-bit quantization for low VRAM
from transformers import BitsAndBytesConfig

has_cuda = torch.cuda.is_available()
# Only query bf16 support when a CUDA device exists; otherwise use float16.
compute_dtype = (
    torch.bfloat16 if has_cuda and torch.cuda.is_bf16_supported() else torch.float16
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

print(f"Loading model: {MODEL_NAME}")
try:
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load model with quantization (GPU only; on CPU load the plain model).
    # NOTE(review): trust_remote_code=True executes code shipped with the model
    # repository. Keep it only for model repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config if has_cuda else None,
        device_map="auto" if has_cuda else None,
        trust_remote_code=True,
    )

    print(f"✅ Model loaded successfully")
    if has_cuda:
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f}GB")

except Exception as e:
    # Best-effort fallback: load without quantization or device placement.
    print(f"❌ Error loading model: {e}")
    print("Falling back to CPU-only mode...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Make sure a pad token exists on BOTH load paths. The fallback branch reloads
# the tokenizer, so setting this only inside `try` would lose it and break
# batch padding during training.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Test baseline performance
def test_instruction_following(model, tokenizer, prompt, max_length=100):
    """Test model's instruction following capability"""
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=inputs["input_ids"].shape[1] + max_length,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated[len(prompt) :].strip()


# Baseline test: record how the untouched base model answers, for the
# before/after comparison later in the notebook.
test_prompt = (
    "### Instruction:\n"
    "Explain what is machine learning in simple terms.\n\n"
    "### Response:\n"
)
baseline_response = test_instruction_following(model, tokenizer, test_prompt)

print("\n🧪 Baseline Test:")
print("Prompt: " + test_prompt)
print("Response: " + str(baseline_response))

In [None]:
# === Cell 5: LoRA Configuration and Training Setup ===
print("⚙️ Setting up LoRA for parameter-efficient fine-tuning...")

# Attention projection names differ by architecture: LLaMA/Qwen-style models
# use q_proj/k_proj/v_proj/o_proj, while GPT-2-style models (including the
# default DialoGPT) use c_attn/c_proj. Pick the names that actually exist in
# the loaded model so LoRA is really applied instead of silently failing.
LORA_TARGET_CANDIDATES = ("q_proj", "k_proj", "v_proj", "o_proj", "c_attn", "c_proj")
module_leaf_names = {name.rsplit(".", 1)[-1] for name, _ in model.named_modules()}
lora_target_modules = [
    name for name in LORA_TARGET_CANDIDATES if name in module_leaf_names
]
if not lora_target_modules:
    # Nothing matched: keep the original default so PEFT reports a clear error.
    lora_target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]
print(f"LoRA target modules: {lora_target_modules}")

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,  # Low rank
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,
    target_modules=lora_target_modules,
)

# Apply LoRA to model
try:
    if getattr(model, "is_loaded_in_4bit", False) or getattr(
        model, "is_loaded_in_8bit", False
    ):
        # Recommended PEFT preparation step for quantized base models.
        # Gradient checkpointing is left off; the demo model is small.
        from peft import prepare_model_for_kbit_training

        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=False
        )

    model = get_peft_model(model, lora_config)
    print("✅ LoRA applied successfully")
    trainable_count = model.num_parameters(only_trainable=True)
    total_count = model.num_parameters()
    print(f"Trainable parameters: {trainable_count:,}")
    print(f"Total parameters: {total_count:,}")
    print(f"Trainable %: {100 * trainable_count / total_count:.2f}%")
except Exception as e:
    # NOTE(review): if the base model is quantized, full fine-tuning is not
    # expected to work; treat this branch as a signal to fix target_modules.
    print(f"⚠️ Error applying LoRA: {e}")
    print(
        "This might be due to model architecture. Continuing with full fine-tuning..."
    )
# Data collator for language modeling
from transformers import DataCollatorForLanguageModeling


def tokenize_function(examples):
    """Tokenize the "full_text" field of a batch, truncating to MAX_LENGTH.

    No padding is applied here.
    """
    texts = examples["full_text"]
    encoded = tokenizer(
        texts,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False,
    )
    return encoded


# Tokenize every record, dropping the original text columns.
source_columns = filtered_dataset.column_names
tokenized_dataset = filtered_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=source_columns,
)

# Deterministic shuffle, then keep a small subset for a quick demo
# (use the full dataset for real training).
shuffled_dataset = tokenized_dataset.shuffle(seed=42)
subset_size = min(100, len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(subset_size))

# Collator for causal language modeling (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print(f"✅ Prepared training dataset: {len(train_dataset)} examples")

In [None]:
# === Cell 6: Lightweight Fine-tuning Execution ===
import math

print("🚀 Starting instruction fine-tuning...")

# Choose the mixed-precision mode to match the hardware: bf16 where supported
# (consistent with the 4-bit compute dtype selection), otherwise fp16 on GPU,
# and full precision on CPU.
cuda_available = torch.cuda.is_available()
bf16_supported = cuda_available and torch.cuda.is_bf16_supported()

# Training arguments (conservative for demo)
training_args = TrainingArguments(
    output_dir=f"{AI_CACHE_ROOT}/instruction_tuning_demo",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Very short for demo
    per_device_train_batch_size=1,  # Low VRAM friendly
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # Conservative learning rate
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no",  # Don't save checkpoints for demo
    dataloader_drop_last=False,
    bf16=bf16_supported,
    fp16=cuda_available and not bf16_supported,
    # The string "none" disables wandb/tensorboard; passing None does not.
    report_to="none",
    remove_unused_columns=False,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Estimate optimizer steps: one step consumes
# per_device_train_batch_size * gradient_accumulation_steps examples.
# This is a single-device estimate.
examples_per_step = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = max(1, math.ceil(len(train_dataset) / examples_per_step))
estimated_total_steps = math.ceil(steps_per_epoch * training_args.num_train_epochs)

print("📊 Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Total steps: ~{estimated_total_steps}")

# Start training
try:
    print("\n🎯 Starting training...")
    trainer.train()
    print("✅ Training completed successfully!")

    # Save LoRA adapter
    if hasattr(model, "save_pretrained"):
        adapter_path = f"{AI_CACHE_ROOT}/instruction_lora_adapter"
        model.save_pretrained(adapter_path)
        print(f"💾 LoRA adapter saved to: {adapter_path}")

except Exception as e:
    # Best-effort demo: report the failure and let the notebook continue.
    print(f"⚠️ Training error: {e}")
    print(
        "This might be due to memory constraints. Try reducing batch size or sequence length."
    )


In [None]:
# === Cell 7: Post-Training Evaluation and Comparison ===
print("📈 Evaluating fine-tuned model...")

# Re-run the baseline prompt against the tuned model.
tuned_response = test_instruction_following(model, tokenizer, test_prompt)

divider = "=" * 50
print("🔄 Before vs After Comparison:")
print(divider)
print("BASELINE (pre-training):")
print(baseline_response)
print("\n" + divider)
print("FINE-TUNED (post-training):")
print(tuned_response)
print(divider)

# Additional test cases, built from the same instruction-only template.
prompt_template = "### Instruction:\n{}\n\n### Response:\n"
test_cases = [
    prompt_template.format(task)
    for task in (
        "List three benefits of exercise.",
        "Write a haiku about technology.",
        "Explain why the sky is blue.",
    )
]

print("\n🧪 Additional Test Cases:")
for case_number, prompt in enumerate(test_cases, start=1):
    print(f"\nTest {case_number}:")
    # Recover the bare instruction text by stripping the template markers.
    before_response = prompt.split("Response:")[0]
    instruction_only = (
        before_response.replace("### Instruction:", "").replace("###", "").strip()
    )
    print(f"Instruction: {instruction_only}")
    response = test_instruction_following(model, tokenizer, prompt, max_length=80)
    print(f"Response: {response}")

# Qualitative assessment checklist (to be answered by the reader).
print("\n✅ Qualitative Assessment Checklist:")
checklist_questions = (
    "Does the model follow instructions more consistently?",
    "Are responses more structured and coherent?",
    "Does the model use appropriate formatting?",
    "Is the model still safe and not generating harmful content?",
)
for item_number, question in enumerate(checklist_questions, start=1):
    print(f"{item_number}. {question} [Manual check]")

In [None]:
# === Cell 8: Smoke Test and Usage Notes ===
print("🧪 Final Smoke Test...")


def smoke_test():
    """Quick end-to-end sanity check of tokenizer, model and prompt format.

    Uses the notebook-level `tokenizer`, `model` and `format_instruction`.

    Returns:
        True when every check passes. False when any check raises; the
        failure is printed rather than propagated.
    """
    try:
        # Test tokenizer
        test_text = "Hello world"
        tokens = tokenizer(test_text, return_tensors="pt")
        assert tokens["input_ids"].shape[1] > 0, "tokenizer produced no tokens"

        # Run the forward pass on the model's device; the model may be on GPU
        # while freshly tokenized tensors are on CPU.
        device = getattr(model, "device", None)
        if device is not None:
            tokens = {k: v.to(device) for k, v in tokens.items()}

        # Test model inference
        with torch.no_grad():
            outputs = model(**tokens)

        # The output dimension must cover every tokenizer id. It may be larger
        # than the tokenizer size (padded embeddings), so strict equality with
        # tokenizer.vocab_size would fail spuriously.
        logits_dim = outputs.logits.shape[-1]
        assert logits_dim >= len(tokenizer), (
            f"logits dimension {logits_dim} is smaller than "
            f"tokenizer size {len(tokenizer)}"
        )

        # Test instruction format
        formatted = format_instruction(
            {"instruction": "Test instruction", "input": "", "output": "Test output"}
        )
        assert "### Instruction:" in formatted["full_text"]
        assert "### Response:" in formatted["full_text"]

        print("✅ All smoke tests passed!")
        return True

    except Exception as e:
        print(f"❌ Smoke test failed: {e}")
        return False


# Run the smoke test once and remember the outcome for the final summary.
smoke_test_result = smoke_test()

# Static guidance shown at the end of the notebook.
usage_notes = """
🎯 What we accomplished:
- Loaded and formatted instruction dataset (Alpaca-style)
- Applied LoRA for parameter-efficient fine-tuning
- Trained model on instruction-following tasks
- Compared before/after performance

🔧 Key Parameters:
- LoRA rank: 16 (balance between efficiency and capacity)
- Learning rate: 2e-4 (conservative to avoid catastrophic forgetting)
- Max sequence length: 512 (adjust based on your needs)

⚠️ Common Issues:
- OOM errors: Reduce batch_size, use gradient_checkpointing
- Poor convergence: Increase epochs, check learning rate
- Quality issues: Need more diverse training data

🚀 Next Steps:
- Scale up to larger datasets (10K+ examples)
- Try different base models (Qwen2.5, Yi, Llama)
- Experiment with QLoRA for even lower memory usage
- Add evaluation metrics (ROUGE, BLEU, human eval)
- Deploy as a chat interface

💡 Production Tips:
- Use validation set to prevent overfitting
- Monitor loss curves and stop early if needed
- Consider multi-epoch training with lr scheduling
- Save multiple checkpoints for comparison
"""

print("\n📚 Usage Notes and Next Steps:")
print(usage_notes)

# Closing summary.
if smoke_test_result:
    smoke_status = "PASSED"
else:
    smoke_status = "FAILED"
print("\n🎉 nb11 Instruction Tuning Demo completed!")
print(f"💾 Model artifacts saved to: {AI_CACHE_ROOT}/")
print(f"🔧 Smoke test status: {smoke_status}")

In [None]:
# Quick validation that instruction tuning setup works correctly
def validate_instruction_tuning():
    """Run three independent setup checks and return one status line per check.

    Uses the notebook-level `format_instruction`, `model` and `tokenizer`.
    Each check is isolated: a failure (including a missing name) is recorded
    as a "❌" line and the remaining checks still run. Only `Exception` is
    caught, so Ctrl-C / kernel interrupts are not swallowed.

    Returns:
        list[str]: three status strings, in check order.
    """
    checks = []

    # Check 1: Dataset formatting
    try:
        sample = format_instruction(
            {"instruction": "Test", "input": "", "output": "Response"}
        )
        assert "### Instruction:" in sample["full_text"]
        checks.append("✅ Dataset formatting works")
    except Exception:
        checks.append("❌ Dataset formatting failed")

    # Check 2: Model loading with quantization
    try:
        assert model is not None
        assert tokenizer is not None
        checks.append("✅ Model and tokenizer loaded")
    except Exception:
        checks.append("❌ Model loading failed")

    # Check 3: LoRA application
    try:
        trainable_params = model.num_parameters(only_trainable=True)
        total_params = model.num_parameters()
        ratio = trainable_params / total_params
        assert ratio < 0.1  # LoRA should train <10% of parameters
        checks.append(f"✅ LoRA applied ({ratio:.2%} trainable)")
    except Exception:
        checks.append("❌ LoRA not properly applied")

    return checks


# Run the checks and print one status line per check.
validation_results = validate_instruction_tuning()
print(*validation_results, sep="\n")


## 📋 本章小結

### ✅ 完成項目
- **指令數據處理**: Alpaca 格式解析、長度過濾、模板格式化
- **LoRA 微調流程**: 參數效率訓練、4bit 量化支援、低顯存優化
- **評估對比**: 微調前後指令跟隨能力比較、多案例測試
- **實用工具**: 可重用的格式化函數、煙霧測試、使用指南

### 🎯 核心原理要點
- **指令調優本質**: 教會模型遵循特定格式的指令，提升任務執行能力
- **LoRA 效率**: 只訓練 1-10% 參數即可達到接近全量微調的效果
- **數據質量重要性**: 高質量的「指令-回應」配對，比大量隨機或低質量的數據更重要
- **量化權衡**: 4bit 量化降低精度但大幅節省顯存，適合資源受限環境

### 🚀 下一步建議
1. **擴展到中文**: 使用中文指令數據集（如 BELLE、Chinese-Alpaca）
2. **QLoRA 進階**: 下一章深入實作 QLoRA 的 4bit（NF4）量化微調技術
3. **評估指標**: 加入 ROUGE、BLEU、人工評估等量化指標
4. **多輪對話**: 處理對話歷史，支援上下文相關的指令跟隨

**何時使用指令調優**: 當基礎模型無法很好地遵循特定格式指令，或需要針對特定領域任務優化時，指令調優是最直接有效的方法。