In [None]:
# nb11_instruction_tuning_demo.ipynb
# 指令調優示範 - Instruction Tuning with LoRA/QLoRA

# === Cell 1: Shared Cache Bootstrap & Environment Setup ===
import os, pathlib, torch, warnings

# Silence UserWarning noise so the demo output stays readable.
warnings.filterwarnings("ignore", category=UserWarning)

# Every cache location is derived from a single root, overridable through the
# AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")
paths = dict(
    HF_HOME=f"{AI_CACHE_ROOT}/hf",
    TRANSFORMERS_CACHE=f"{AI_CACHE_ROOT}/hf/transformers",
    HF_DATASETS_CACHE=f"{AI_CACHE_ROOT}/hf/datasets",
    HUGGINGFACE_HUB_CACHE=f"{AI_CACHE_ROOT}/hf/hub",
    TORCH_HOME=f"{AI_CACHE_ROOT}/torch",
)

# Export each location and make sure the directory exists before any library
# tries to write into it.
for env_name, cache_dir in paths.items():
    os.environ[env_name] = cache_dir
    os.makedirs(cache_dir, exist_ok=True)

# Short hardware report.
cuda_ready = torch.cuda.is_available()
print(f"[Cache] Root: {AI_CACHE_ROOT}")
print(f"[GPU] Available: {cuda_ready}")
if cuda_ready:
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"[GPU] Device: {torch.cuda.get_device_name(0)}")
    print(f"[GPU] Memory: {total_gb:.1f}GB")

In [None]:
# Check required packages
# Every later cell uses the names imported here, so a missing required package
# must stop the notebook at this point instead of surfacing later as a
# confusing NameError.
try:
    import transformers, datasets, peft
    from transformers import (
        AutoTokenizer,
        AutoModelForCausalLM,
        TrainingArguments,
        Trainer,
    )
    from peft import LoraConfig, get_peft_model, TaskType, PeftModel
    from datasets import load_dataset
    import json, random

    print("‚úÖ All required packages imported successfully")
except ImportError as e:
    print(f"‚ùå Missing package: {e}")
    print(
        "Install with: pip install transformers datasets peft bitsandbytes accelerate"
    )
    raise

# bitsandbytes is only needed for quantized loading on CUDA. Keeping it out of
# the required imports lets CPU-only environments still run the
# non-quantized path of the notebook.
try:
    import bitsandbytes
except Exception as e:  # some builds raise non-ImportError errors without CUDA
    bitsandbytes = None
    print(f"[Warn] bitsandbytes unavailable ({e}); 4-bit quantized loading is disabled")

In [None]:
# === Cell 2: Load and Explore Instruction Dataset ===
print("üîç Loading instruction dataset...")

# Load an Alpaca-style instruction dataset. Whatever source ends up being
# used, the resulting `dataset` exposes "instruction" / "input" / "output"
# columns, which is the schema the formatting cell expects.
try:
    # Option 1: Stanford Alpaca (English)
    dataset = load_dataset("tatsu-lab/alpaca", split="train")
    print(f"‚úÖ Loaded Stanford Alpaca dataset: {len(dataset)} examples")
except Exception as alpaca_error:  # not a bare except: Ctrl-C must still interrupt
    print(f"[Dataset] Stanford Alpaca unavailable: {alpaca_error}")
    try:
        # Option 2: Databricks Dolly (alternative). Dolly names its fields
        # "context" and "response", so rename them to the Alpaca schema.
        dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
        dataset = dataset.rename_columns({"context": "input", "response": "output"})
        print(f"‚úÖ Loaded Databricks Dolly dataset: {len(dataset)} examples")
    except Exception as dolly_error:
        print(f"[Dataset] Databricks Dolly unavailable: {dolly_error}")
        # Option 3: small synthetic Chinese instruction set (offline fallback)
        print("‚ö†Ô∏è Using synthetic examples (original datasets unavailable)")
        synthetic_data = [
            {
                "instruction": "請解釋什麼是機器學習",
                "input": "",
                "output": "機器學習是人工智慧的一個分支，它使計算機系統能夠通過經驗自動學習和改進，而無需被明確編程。",
            },
            {
                "instruction": "將以下句子翻譯成英文",
                "input": "今天天氣很好",
                "output": "The weather is very nice today.",
            },
            {
                "instruction": "列出三個程式設計的最佳實踐",
                "input": "",
                "output": "1. 寫清晰的註解和文檔\n2. 使用有意義的變數和函數名稱\n3. 保持代碼簡潔和模組化",
            },
        ]
        from datasets import Dataset

        dataset = Dataset.from_list(synthetic_data * 100)  # Repeat for demo
        print(f"‚úÖ Created synthetic dataset: {len(dataset)} examples")

# Explore dataset structure
print("\nüìä Dataset Structure:")
print(f"Columns: {dataset.column_names}")
print(f"First example:")
example = dataset[0]
for key, value in example.items():
    # Only strings are truncated; other value types are shown as-is.
    if isinstance(value, str) and len(value) > 100:
        preview = value[:100] + "..."
    else:
        preview = value
    print(f"  {key}: {preview!r}")

# Basic statistics. Reading the single "input" column avoids materialising
# every row as a dict, and `or ""` tolerates null entries.
if "input" in dataset.column_names:
    instructions_with_input = sum(
        1 for text in dataset["input"] if (text or "").strip()
    )
else:
    instructions_with_input = 0
print(f"\nüìà Dataset Statistics:")
print(f"Total examples: {len(dataset)}")
print(f"Examples with input: {instructions_with_input}")
print(f"Examples without input: {len(dataset) - instructions_with_input}")

In [None]:
# === Cell 3: Data Preprocessing and Formatting ===
print("üîß Preprocessing instruction data...")


def format_instruction(example):
    """
    Format one instruction record into a single training text.

    Uses the Alpaca-style prompt template: an "### Instruction:" section, an
    optional "### Input:" section, and a "### Response:" header followed by
    the expected answer.

    Args:
        example: Mapping with "instruction" and "output" strings and an
            optional "input" string. A missing, empty, whitespace-only or
            None "input" is treated as "no input".

    Returns:
        dict with:
            "prompt":    template text up to and including "### Response:\\n"
            "response":  the stripped output
            "full_text": prompt + response
            "length":    number of characters (not tokens) in full_text
    """
    instruction = example["instruction"].strip()
    # `or ""` covers records whose "input" field exists but is null.
    input_text = (example.get("input") or "").strip()
    output = example["output"].strip()

    if input_text:
        # Instruction with input
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
    else:
        # Instruction only
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

    # Full text for training (prompt + response)
    full_text = prompt + output

    return {
        "prompt": prompt,
        "response": output,
        "full_text": full_text,
        "length": len(full_text),
    }


# Add prompt / response / full_text / length columns to every record
formatted_dataset = dataset.map(format_instruction)

# Character budget per example; the same constant is reused later as the
# tokenizer's max_length. Adjust based on your GPU memory.
MAX_LENGTH = 512


def _fits_length_budget(row):
    """Keep rows whose formatted text is at most MAX_LENGTH characters."""
    return row["length"] <= MAX_LENGTH


filtered_dataset = formatted_dataset.filter(_fits_length_budget)

count_before, count_after = len(formatted_dataset), len(filtered_dataset)
print(
    f"‚úÖ Formatted dataset: {count_before} ‚Üí {count_after} examples (after length filtering)"
)

# Display the first surviving record as a sanity check
print("\nüìù Formatted Example:")
example = filtered_dataset[0]
print("Prompt:", example["prompt"], "Response:", example["response"], sep="\n")
print(f"Total length: {example['length']} chars")

In [None]:
# === Cell 4: Load Base Model and Baseline Test ===
print("ü§ñ Loading base model for instruction tuning...")

# Model selection based on available VRAM
MODEL_NAME = "microsoft/DialoGPT-small"  # Lightweight for demo
# Alternative: "Qwen/Qwen2.5-0.5B-Instruct"
# (seq2seq checkpoints such as "google/flan-t5-small" will not load with
# AutoModelForCausalLM)

from transformers import BitsAndBytesConfig

use_cuda = torch.cuda.is_available()


def _load_tokenizer(model_name):
    """Load the tokenizer and guarantee it has a pad token.

    When the tokenizer has no pad token, EOS is reused so that batch padding
    in the data collator works on every loading path.
    """
    loaded = AutoTokenizer.from_pretrained(model_name)
    if loaded.pad_token is None:
        loaded.pad_token = loaded.eos_token
    return loaded


# Configure 4-bit quantization for low VRAM. Only built when CUDA is present:
# the bf16 capability query is a CUDA call and quantized loading needs a GPU.
if use_cuda:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=(
            torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        ),
    )
else:
    bnb_config = None

print(f"Loading model: {MODEL_NAME}")
tokenizer = _load_tokenizer(MODEL_NAME)
try:
    # Load model with quantization (plain load when bnb_config is None)
    # NOTE(review): trust_remote_code=True executes code shipped in the model
    # repository; the default MODEL_NAME does not appear to need it. Confirm
    # before pointing this at an untrusted checkpoint.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto" if use_cuda else None,
        trust_remote_code=True,
    )

    print(f"‚úÖ Model loaded successfully")
    if use_cuda:
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f}GB")

except Exception as e:
    # Best-effort fallback: retry without quantization or device mapping.
    print(f"‚ùå Error loading model: {e}")
    print("Falling back to CPU-only mode...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Test baseline performance
def test_instruction_following(model, tokenizer, prompt, max_length=100):
    """Test model's instruction following capability"""
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=inputs["input_ids"].shape[1] + max_length,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated[len(prompt) :].strip()


# Baseline: record how the untuned model answers, for the later comparison
test_prompt = (
    "### Instruction:\n"
    "Explain what is machine learning in simple terms.\n\n"
    "### Response:\n"
)
baseline_response = test_instruction_following(model, tokenizer, test_prompt)
print(
    "\nüß™ Baseline Test:",
    f"Prompt: {test_prompt}",
    f"Response: {baseline_response}",
    sep="\n",
)

In [None]:
# === Cell 5: LoRA Configuration and Training Setup ===
print("‚öôÔ∏è Setting up LoRA for parameter-efficient fine-tuning...")


def _pick_lora_targets(base_model):
    """Choose LoRA target modules that actually exist in ``base_model``.

    Attention projection names differ between architectures: LLaMA/Qwen-style
    models use q_proj/k_proj/v_proj/o_proj, while GPT-2-style models (such as
    DialoGPT) use c_attn/c_proj.

    Returns:
        (target_modules, fan_in_fan_out). ``fan_in_fan_out`` is True when a
        selected layer is a ``Conv1D``, which stores its weight as
        (fan_in, fan_out).
    """
    leaf_types = {}
    for qualified_name, module in base_model.named_modules():
        leaf = qualified_name.rsplit(".", 1)[-1]
        leaf_types.setdefault(leaf, type(module).__name__)

    candidate_groups = (
        ["q_proj", "v_proj", "k_proj", "o_proj"],  # LLaMA / Qwen style
        ["c_attn", "c_proj"],  # GPT-2 style
    )
    for group in candidate_groups:
        found = [name for name in group if name in leaf_types]
        if found:
            uses_conv1d = any(leaf_types[name] == "Conv1D" for name in found)
            return found, uses_conv1d
    # Nothing recognised: keep the original default and let PEFT report it.
    return list(candidate_groups[0]), False


lora_targets, lora_fan_in_fan_out = _pick_lora_targets(model)
print(f"LoRA target modules: {lora_targets}")

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,  # Low rank
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,
    target_modules=lora_targets,  # Attention layers detected above
    fan_in_fan_out=lora_fan_in_fan_out,
)

# Apply LoRA to model
try:
    # k-bit quantized base models need extra preparation before adapters are
    # attached; this is PEFT's helper for that step.
    if getattr(model, "is_loaded_in_4bit", False) or getattr(
        model, "is_loaded_in_8bit", False
    ):
        model = peft.prepare_model_for_kbit_training(model)

    model = get_peft_model(model, lora_config)
    print("‚úÖ LoRA applied successfully")
    print(f"Trainable parameters: {model.num_parameters(only_trainable=True):,}")
    print(f"Total parameters: {model.num_parameters():,}")
    print(
        f"Trainable %: {100 * model.num_parameters(only_trainable=True) / model.num_parameters():.2f}%"
    )
except Exception as e:
    print(f"‚ö†Ô∏è Error applying LoRA: {e}")
    print(
        "This might be due to model architecture. Continuing with full fine-tuning..."
    )

# Data collator for language modeling
from transformers import DataCollatorForLanguageModeling


def tokenize_function(examples):
    """Tokenize a batch of formatted records.

    Reads the "full_text" column and returns the tokenizer output, truncated
    to MAX_LENGTH tokens and left unpadded (padding is left to the collator).
    """
    texts = examples["full_text"]
    tokenizer_options = {
        "truncation": True,
        "padding": False,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(texts, **tokenizer_options)


# Tokenize every record, dropping the text columns so only model inputs remain
tokenized_dataset = filtered_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=filtered_dataset.column_names,
)
shuffled_dataset = tokenized_dataset.shuffle(seed=42)

# Keep at most 100 examples for a quick demo (use everything for real training)
demo_size = min(100, len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(demo_size))

# mlm=False selects causal language modeling rather than masked LM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print(f"‚úÖ Prepared training dataset: {len(train_dataset)} examples")

In [None]:
# === Cell 6: Lightweight Fine-tuning Execution ===
print("üöÄ Starting instruction fine-tuning...")

# Pick the mixed-precision mode from the hardware: bf16 where the GPU supports
# it, fp16 on other GPUs, full precision on CPU. The bf16 capability query is
# only made when CUDA is available.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()

# Training arguments (conservative for demo)
training_args = TrainingArguments(
    output_dir=f"{AI_CACHE_ROOT}/instruction_tuning_demo",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Very short for demo
    per_device_train_batch_size=1,  # Low VRAM friendly
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no",  # Don't save checkpoints for demo
    dataloader_drop_last=False,
    bf16=use_bf16,
    fp16=use_cuda and not use_bf16,
    report_to="none",  # the string "none" disables wandb/tensorboard; None does not
    remove_unused_columns=False,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Optimizer steps = ceil(examples / (batch size * accumulation)) per epoch.
# Single-device estimate; the printed value is approximate by design.
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = max(1, -(-len(train_dataset) // effective_batch_size))
estimated_total_steps = int(steps_per_epoch * training_args.num_train_epochs)

print("üìä Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Total steps: ~{estimated_total_steps}")

# Start training
try:
    print("\nüéØ Starting training...")
    trainer.train()
    print("‚úÖ Training completed successfully!")

    # Save LoRA adapter
    if hasattr(model, "save_pretrained"):
        adapter_path = f"{AI_CACHE_ROOT}/instruction_lora_adapter"
        model.save_pretrained(adapter_path)
        print(f"üíæ LoRA adapter saved to: {adapter_path}")

except Exception as e:
    # Best-effort demo: report the failure and let the remaining cells run.
    print(f"‚ö†Ô∏è Training error: {e}")
    print(
        "This might be due to memory constraints. Try reducing batch size or sequence length."
    )


In [None]:
# === Cell 7: Post-Training Evaluation and Comparison ===
print("üìà Evaluating fine-tuned model...")

# Ask the tuned model the same question that was used for the baseline
tuned_response = test_instruction_following(model, tokenizer, test_prompt)

# Side-by-side report of the two answers
divider = "=" * 50
for report_line in (
    "üîÑ Before vs After Comparison:",
    divider,
    "BASELINE (pre-training):",
    baseline_response,
    "\n" + divider,
    "FINE-TUNED (post-training):",
    tuned_response,
    divider,
):
    print(report_line)

# Additional test cases
test_cases = [
    "### Instruction:\nList three benefits of exercise.\n\n### Response:\n",
    "### Instruction:\nWrite a haiku about technology.\n\n### Response:\n",
    "### Instruction:\nExplain why the sky is blue.\n\n### Response:\n",
]

print("\nüß™ Additional Test Cases:")
for case_number, prompt in enumerate(test_cases, start=1):
    # Recover the bare instruction text from the prompt template for display
    before_response = prompt.split("Response:")[0]
    instruction_only = (
        before_response.replace("### Instruction:", "").replace("###", "").strip()
    )
    print(f"\nTest {case_number}:")
    print(f"Instruction: {instruction_only}")
    response = test_instruction_following(model, tokenizer, prompt, max_length=80)
    print(f"Response: {response}")

# Qualitative assessment checklist (answered by a human reader)
print("\n‚úÖ Qualitative Assessment Checklist:")
for checklist_item in (
    "1. Does the model follow instructions more consistently? [Manual check]",
    "2. Are responses more structured and coherent? [Manual check]",
    "3. Does the model use appropriate formatting? [Manual check]",
    "4. Is the model still safe and not generating harmful content? [Manual check]",
):
    print(checklist_item)

In [None]:
# === Cell 8: Smoke Test and Usage Notes ===
print("üß™ Final Smoke Test...")


def smoke_test():
    """Quick end-to-end sanity check of tokenizer, model and prompt template.

    Uses the notebook-level ``tokenizer``, ``model`` and
    ``format_instruction``. Any failure is caught and reported.

    Returns:
        True when every check passes, False otherwise.
    """
    try:
        # Test tokenizer
        test_text = "Hello world"
        tokens = tokenizer(test_text, return_tensors="pt")
        assert tokens["input_ids"].shape[1] > 0

        # Test model inference. Inputs must live on the model's device, which
        # is not necessarily the CPU the tokenizer produced them on.
        device = next(model.parameters()).device
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
        # The output head may be wider than the tokenizer vocabulary when the
        # embedding matrix is padded, so require "at least", not "exactly".
        assert outputs.logits.shape[-1] >= tokenizer.vocab_size

        # Test instruction format
        formatted = format_instruction(
            {"instruction": "Test instruction", "input": "", "output": "Test output"}
        )
        assert "### Instruction:" in formatted["full_text"]
        assert "### Response:" in formatted["full_text"]

        print("‚úÖ All smoke tests passed!")
        return True

    except Exception as e:
        print(f"‚ùå Smoke test failed: {e}")
        return False


# Run the smoke test once and keep its boolean result for the final status line.
smoke_test_result = smoke_test()

# Static, human-readable recap of the notebook. The values quoted in the text
# are written out by hand, not read from the configuration objects.
print("\nüìö Usage Notes and Next Steps:")
print(
    """
üéØ What we accomplished:
- Loaded and formatted instruction dataset (Alpaca-style)
- Applied LoRA for parameter-efficient fine-tuning
- Trained model on instruction-following tasks
- Compared before/after performance

üîß Key Parameters:
- LoRA rank: 16 (balance between efficiency and capacity)
- Learning rate: 2e-4 (conservative to avoid catastrophic forgetting)
- Max sequence length: 512 (adjust based on your needs)

‚ö†Ô∏è Common Issues:
- OOM errors: Reduce batch_size, use gradient_checkpointing
- Poor convergence: Increase epochs, check learning rate
- Quality issues: Need more diverse training data

üöÄ Next Steps:
- Scale up to larger datasets (10K+ examples)
- Try different base models (Qwen2.5, Yi, Llama)
- Experiment with QLoRA for even lower memory usage
- Add evaluation metrics (ROUGE, BLEU, human eval)
- Deploy as a chat interface

üí° Production Tips:
- Use validation set to prevent overfitting
- Monitor loss curves and stop early if needed
- Consider multi-epoch training with lr scheduling
- Save multiple checkpoints for comparison
"""
)

# Closing status lines.
# NOTE(review): the "saved to" line prints unconditionally; it does not check
# that training or the adapter save actually succeeded.
print(f"\nüéâ nb11 Instruction Tuning Demo completed!")
print(f"üíæ Model artifacts saved to: {AI_CACHE_ROOT}/")
print(f"üîß Smoke test status: {'PASSED' if smoke_test_result else 'FAILED'}")

In [None]:
# Quick validation that instruction tuning setup works correctly
def validate_instruction_tuning():
    """Run three independent setup checks and return their status lines.

    Each check is isolated so one failure does not hide the others. Ordinary
    errors (including a name that was never defined) mark that check as
    failed; KeyboardInterrupt and SystemExit are deliberately not caught so
    the notebook can still be interrupted.

    Returns:
        list[str]: one status message per check, in order: dataset
        formatting, model/tokenizer presence, LoRA application.
    """
    checks = []

    # Check 1: Dataset formatting
    try:
        sample = format_instruction(
            {"instruction": "Test", "input": "", "output": "Response"}
        )
        assert "### Instruction:" in sample["full_text"]
        checks.append("‚úÖ Dataset formatting works")
    except Exception:
        checks.append("‚ùå Dataset formatting failed")

    # Check 2: Model loading with quantization
    try:
        assert model is not None
        assert tokenizer is not None
        checks.append("‚úÖ Model and tokenizer loaded")
    except Exception:
        checks.append("‚ùå Model loading failed")

    # Check 3: LoRA application
    try:
        trainable_params = model.num_parameters(only_trainable=True)
        total_params = model.num_parameters()
        ratio = trainable_params / total_params
        assert ratio < 0.1  # LoRA should train <10% of parameters
        checks.append(f"‚úÖ LoRA applied ({ratio:.2%} trainable)")
    except Exception:
        checks.append("‚ùå LoRA not properly applied")

    return checks


validation_results = validate_instruction_tuning()
for result in validation_results:
    print(result)


## 📋 本章小結

### ✅ 完成項目
- **指令數據處理**: Alpaca 格式解析、長度過濾、模板格式化
- **LoRA 微調流程**: 參數效率訓練、4bit 量化支援、低顯存優化
- **評估對比**: 微調前後指令跟隨能力比較、多案例測試
- **實用工具**: 可重用的格式化函數、煙霧測試、使用指南

### 🎯 核心原理要點
- **指令調優本質**: 教會模型遵循特定格式的指令，提升任務執行能力
- **LoRA 效率**: 只訓練 1-10% 參數即可達到接近全量微調的效果
- **數據質量重要性**: 高質量指令-回應對比隨機數據更重要
- **量化權衡**: 4bit 量化降低精度但大幅節省顯存，適合資源受限環境

### 🚀 下一步建議
1. **擴展到中文**: 使用中文指令數據集（如 BELLE、Chinese-Alpaca）
2. **QLoRA 進階**: 下一章實作 QLoRA 的 int4 量化微調技術
3. **評估指標**: 加入 ROUGE、BLEU、人工評估等量化指標
4. **多輪對話**: 處理對話歷史，支援上下文相關的指令跟隨

**何時使用指令調優**: 當基礎模型無法很好地遵循特定格式指令，或需要針對特定領域任務優化時，指令調優是最直接有效的方法。