---
## 1. Setup & Imports

In [None]:
# Install required packages (run once).
# Prefer the %pip magic over !pip so the install targets this kernel's environment;
# pin versions here if the notebook needs to be reproducible.
# %pip install transformers datasets accelerate peft bitsandbytes trl pyyaml

In [None]:
import os
import sys
import random
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel,
    TaskType,
)
from datasets import Dataset, load_dataset

# Environment report: PyTorch build, CUDA availability, and (if present) device 0 details.
has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; show it in decimal gigabytes.
    total_mem = device_props.total_memory / 1e9
    print(f"Memory: {total_mem:.2f} GB")

---
## 2. Configuration

In [None]:
# Stage 3 settings. Every tunable knob used by later cells lives in this one dict.
CONFIG = {
    # --- Base model (adapters are attached to the base weights, not a checkpoint) ---
    "model_name": "Qwen/Qwen2.5-1.5B",

    # --- Quantization: True = QLoRA (4-bit base), False = standard LoRA ---
    "use_qlora": True,

    # --- LoRA adapter shape ---
    "lora_r": 64,            # adapter rank
    "lora_alpha": 128,       # scaling factor; here 2 x rank
    "lora_dropout": 0.05,
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],

    # --- Data ---
    "max_length": 1024,      # tokens per example (truncate / pad target)
    "train_split": 0.9,      # fraction of samples used for training
    "alpaca_subset": 1000,   # demo-sized subset; falsy value = full dataset

    # --- Optimization ---
    "batch_size": 4,
    "gradient_accumulation_steps": 4,
    "num_epochs": 3,
    "learning_rate": 2e-4,   # higher LR than full fine-tuning, as is usual for LoRA
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,

    # --- Output ---
    "output_dir": "../outputs/stage3_lora",
}

# Echo the settings so the run is self-describing.
print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

---
## 3. Load Model with Quantization (QLoRA)

🔑 **QLoRA** = 4-bit quantized base model + LoRA adapters

This dramatically reduces memory usage while maintaining quality.

In [None]:
# Baseline reading: drop cached allocator blocks, then report what is still allocated.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    allocated_gb = torch.cuda.memory_allocated() / 1e9
    print(f"GPU memory before loading: {allocated_gb:.2f} GB")

In [None]:
# Tokenizer for the base model named in CONFIG.
model_id = CONFIG["model_name"]
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Padding is required for batching; fall back to EOS when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Tokenizer loaded: {model_id}")

In [None]:
# Load the base model either 4-bit quantized (QLoRA) or in plain bf16 (standard LoRA).
shared_load_kwargs = {"device_map": "auto", "trust_remote_code": True}

if not CONFIG["use_qlora"]:
    print("Loading model for standard LoRA (no quantization)...")

    model = AutoModelForCausalLM.from_pretrained(
        CONFIG["model_name"],
        torch_dtype=torch.bfloat16,
        **shared_load_kwargs,
    )
else:
    print("Loading model with QLoRA (4-bit quantization)...")

    # 4-bit NF4 weights with double quantization; matmuls are computed in bf16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        CONFIG["model_name"],
        quantization_config=bnb_config,
        **shared_load_kwargs,
    )

    # PEFT helper that readies a quantized model for adapter training.
    model = prepare_model_for_kbit_training(model)

print(f"Model loaded. Parameters: {model.num_parameters():,}")

if torch.cuda.is_available():
    loaded_gb = torch.cuda.memory_allocated() / 1e9
    print(f"GPU memory after loading: {loaded_gb:.2f} GB")

---
## 4. Apply LoRA Adapters

🔧 LoRA trains only small adapter matrices, keeping the base model frozen.

In [None]:
# Adapter hyper-parameters, taken straight from CONFIG.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=CONFIG["target_modules"],
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    bias="none",  # bias terms stay frozen
)

# Echo what the config object actually holds.
print("LoRA Config:")
for label, value in (
    ("Rank (r)", lora_config.r),
    ("Alpha", lora_config.lora_alpha),
    ("Dropout", lora_config.lora_dropout),
    ("Target modules", lora_config.target_modules),
):
    print(f"  {label}: {value}")

In [None]:
# Wrap the base model with LoRA adapters.
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model;
# re-run the model-loading cell first if you need to repeat this step.
model = get_peft_model(model, lora_config)

# Report how small the trainable fraction is.
trainable_params, all_params = model.get_nb_trainable_parameters()
rule = "=" * 60
trainable_pct = 100 * trainable_params / all_params
print(f"\n{rule}")
print("PARAMETER EFFICIENCY")
print(rule)
print(f"All parameters:       {all_params:>15,}")
print(f"Trainable parameters: {trainable_params:>15,}")
print(f"Trainable %:          {trainable_pct:>14.2f}%")
print(rule)

if torch.cuda.is_available():
    lora_gb = torch.cuda.memory_allocated() / 1e9
    print(f"\nGPU memory with LoRA: {lora_gb:.2f} GB")

In [None]:
# List every sub-module whose qualified name mentions "lora", plus its weight shape if it has one.
print("\nModel modules with LoRA:")
for module_name, submodule in model.named_modules():
    if "lora" not in module_name.lower():
        continue
    print(f"  {module_name}")
    # Container modules have no weight of their own; only leaf layers report a shape.
    if hasattr(submodule, 'weight'):
        print(f"    Shape: {submodule.weight.shape}")

---
## 5. Prepare Training Data

Using the same instruction format as Stage 2 for a fair comparison.

In [None]:
# Prompt layouts (kept identical to Stage 2): Alpaca-style, minimal, and ChatML.
INSTRUCTION_TEMPLATES = [
    """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{response}""",

    """Instruction: {instruction}
{input}

Response: {response}""",

    """<|im_start|>system
You are a helpful assistant.
<|im_end|>
<|im_start|>user
{instruction}
{input}
<|im_end|>
<|im_start|>assistant
{response}
<|im_end|>""",
]

def format_sample(sample, template_idx=None):
    """Render one sample dict into a full training text.

    Args:
        sample: Mapping with optional "instruction" and "input" keys, and the
            target text under "response" (falling back to "output"). Missing
            keys are rendered as empty strings.
        template_idx: Index into INSTRUCTION_TEMPLATES. When None, a template
            is drawn uniformly at random from the global `random` generator.

    Returns:
        The chosen template with instruction, input and response filled in.
    """
    if template_idx is None:
        template_idx = random.randrange(len(INSTRUCTION_TEMPLATES))

    fields = {
        "instruction": sample.get("instruction", ""),
        "input": sample.get("input", ""),
        "response": sample.get("response", sample.get("output", "")),
    }
    return INSTRUCTION_TEMPLATES[template_idx].format(**fields)

In [None]:
# Pull the cleaned Alpaca set and normalise each row to instruction / input / response.
print("Loading Alpaca dataset...")
alpaca_dataset = load_dataset("yahma/alpaca-cleaned", split="train")

data = [
    {
        "instruction": row["instruction"],
        "input": row["input"],
        "response": row["output"],
    }
    for row in alpaca_dataset
]

# Optional down-sampling: fixed-seed shuffle, then keep the first N rows.
subset_size = CONFIG["alpaca_subset"]
if subset_size:
    random.seed(42)
    random.shuffle(data)
    data = data[:subset_size]

print(f"Loaded {len(data)} samples")

In [None]:
# Format and split data.
# Seed here so that both the random template choice inside format_sample and the
# shuffle below are reproducible, even when CONFIG["alpaca_subset"] is falsy (no
# seed set upstream) or when this cell is re-run on its own.
random.seed(42)
formatted_texts = [format_sample(s) for s in data]

random.shuffle(formatted_texts)
# First train_split fraction goes to training, the remainder to evaluation.
split_idx = int(len(formatted_texts) * CONFIG["train_split"])
train_texts = formatted_texts[:split_idx]
eval_texts = formatted_texts[split_idx:]

print(f"Train: {len(train_texts)}, Eval: {len(eval_texts)}")

In [None]:
# Tokenize
def tokenize_texts(texts, tokenizer, max_length):
    """Tokenize texts to fixed-length tensors and attach causal-LM labels.

    Args:
        texts: List of strings to encode.
        tokenizer: Callable tokenizer that accepts `truncation`, `max_length`,
            `padding` and `return_tensors` and returns a mapping containing
            "input_ids" and "attention_mask" tensors.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        The tokenizer output with an added "labels" tensor: a copy of
        "input_ids" in which every padded position (attention_mask == 0) is set
        to -100, so that padding is ignored by the loss even if these labels are
        consumed directly rather than rebuilt by a collator.
    """
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )
    labels = tokenized["input_ids"].clone()
    # Mask by attention_mask rather than by pad id: the pad token may be the same
    # id as EOS, and real EOS tokens must not be masked.
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels
    return tokenized

# Encode both splits, then wrap the tensors as `datasets.Dataset` objects (as plain lists).
train_tokenized = tokenize_texts(train_texts, tokenizer, CONFIG["max_length"])
eval_tokenized = tokenize_texts(eval_texts, tokenizer, CONFIG["max_length"])

DATASET_COLUMNS = ("input_ids", "attention_mask", "labels")

train_dataset = Dataset.from_dict(
    {column: train_tokenized[column].tolist() for column in DATASET_COLUMNS}
)

eval_dataset = Dataset.from_dict(
    {column: eval_tokenized[column].tolist() for column in DATASET_COLUMNS}
)

print(f"Train dataset: {train_dataset}")
print(f"Eval dataset: {eval_dataset}")

---
## 6. Setup Training

In [None]:
# Training arguments for LoRA
training_args = TrainingArguments(
    output_dir=CONFIG["output_dir"],

    # Effective batch size = batch_size * gradient_accumulation_steps (per device).
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],

    num_train_epochs=CONFIG["num_epochs"],
    learning_rate=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
    warmup_ratio=CONFIG["warmup_ratio"],
    lr_scheduler_type="cosine",

    # Lower max_grad_norm for LoRA stability
    max_grad_norm=0.3,

    # save_steps must stay a multiple of eval_steps because load_best_model_at_end=True.
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    save_total_limit=2,

    bf16=True,

    # Use paged optimizer for memory efficiency
    optim="paged_adamw_8bit" if CONFIG["use_qlora"] else "adamw_torch",

    gradient_checkpointing=True,
    # Non-reentrant checkpointing does not need the checkpointed inputs to require
    # grad. With a frozen base model (standard LoRA path, where
    # prepare_model_for_kbit_training is not called) the default re-entrant variant
    # can leave the adapters without gradients.
    gradient_checkpointing_kwargs={"use_reentrant": False},

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    report_to="none",
)

print("Training arguments configured.")

In [None]:
# Causal-LM collator (mlm=False) plus the Trainer that ties model, data and arguments together.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("Trainer initialized.")

---
## 7. Test Before Training

In [None]:
def generate_response(model, tokenizer, prompt, max_new_tokens=128):
    """Sample a completion for `prompt` and return `prompt + completion`.

    Args:
        model: Causal LM exposing `.device`, `.generate()`, `.eval()` and `.train()`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Prompt text; returned verbatim as the prefix of the result.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The original prompt string followed by the decoded newly generated
        tokens (special tokens removed from the completion only). Because the
        prompt is returned unchanged, `result.startswith(prompt)` always holds,
        including for prompts that contain special tokens such as ChatML
        markers, which would otherwise be stripped by `skip_special_tokens`.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    # Generate in eval mode so dropout (including LoRA dropout) is disabled, then
    # put the model back in whatever mode the caller had it in.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        if was_training:
            model.train()

    # For a decoder-only model the output starts with the prompt tokens; decode
    # only what comes after them.
    completion = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    )
    return prompt + completion

# Fixed probe prompts, one per instruction template, reused before and after training.
test_prompts = [
    """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is the capital of France?

### Response:""",

    """Instruction: Explain quantum computing in simple terms.

Response:""",

    """<|im_start|>system
You are a helpful assistant.
<|im_end|>
<|im_start|>user
Write a short poem about coding.
<|im_end|>
<|im_start|>assistant
""",
]

banner = "=" * 60
print(banner)
print("MODEL RESPONSES (Before LoRA Training)")
print(banner)

# Keep the pre-training answers, keyed by prompt index, for the later comparison cell.
before_responses = {}
for idx, prompt_text in enumerate(test_prompts):
    print(f"\n--- Test {idx+1} ---")
    full_text = generate_response(model, tokenizer, prompt_text)
    # Drop the echoed prompt when the output starts with it; otherwise keep everything.
    if full_text.startswith(prompt_text):
        completion = full_text[len(prompt_text):].strip()
    else:
        completion = full_text
    # Preview is capped at 150 characters.
    preview = f"{completion[:150]}..." if len(completion) > 150 else completion
    print(f"Response: {preview}")
    before_responses[idx] = completion

---
## 8. Train with LoRA

🚀 **Watch the memory usage** - it should be much lower than full fine-tuning!

In [None]:
# Snapshot of allocated vs. reserved GPU memory right before training starts.
if torch.cuda.is_available():
    allocated_gb = torch.cuda.memory_allocated() / 1e9
    reserved_gb = torch.cuda.memory_reserved() / 1e9
    print(f"GPU memory before training: {allocated_gb:.2f} GB")
    print(f"GPU memory reserved: {reserved_gb:.2f} GB")

In [None]:
# Run the fine-tuning loop and report step count, mean training loss and peak memory.
rule = "=" * 60
print("Starting LoRA training...")
print(rule)

train_result = trainer.train()

print(rule)
print("Training complete!")
print(f"Total steps: {train_result.global_step}")
print(f"Training loss: {train_result.training_loss:.4f}")

if torch.cuda.is_available():
    peak_gb = torch.cuda.max_memory_allocated() / 1e9
    print(f"\nPeak GPU memory: {peak_gb:.2f} GB")

In [None]:
# Final pass over the held-out split.
eval_results = trainer.evaluate()
final_eval_loss = eval_results['eval_loss']
print(f"Eval loss: {final_eval_loss:.4f}")

---
## 9. Test After Training

In [None]:
# Same probe prompts as before training, now answered by the fine-tuned model.
banner = "=" * 60
print(banner)
print("MODEL RESPONSES (After LoRA Training)")
print(banner)

after_responses = {}
for idx, prompt_text in enumerate(test_prompts):
    print(f"\n--- Test {idx+1} ---")
    full_text = generate_response(model, tokenizer, prompt_text)
    # Drop the echoed prompt when the output starts with it; otherwise keep everything.
    if full_text.startswith(prompt_text):
        completion = full_text[len(prompt_text):].strip()
    else:
        completion = full_text
    preview = f"{completion[:150]}..." if len(completion) > 150 else completion
    print(f"Response: {preview}")
    after_responses[idx] = completion

In [None]:
# Side-by-side view of the stored answers, each clipped to 100 characters.
banner = "=" * 60
print(banner)
print("COMPARISON: Before vs After LoRA Training")
print(banner)

for idx in range(len(test_prompts)):
    print(f"\n--- Test {idx+1} ---")
    for label, answers in (("Before: ", before_responses), ("After:  ", after_responses)):
        text = answers[idx]
        clipped = f"{text[:100]}..." if len(text) > 100 else text
        print(f"{label}{clipped}")

---
## 10. Save LoRA Adapters

In [None]:
# Save LoRA adapters only (small files!)
adapter_path = Path(CONFIG["output_dir"]) / "adapter"
adapter_path.mkdir(parents=True, exist_ok=True)

# Store the tokenizer next to the adapter so the directory is self-contained.
model.save_pretrained(adapter_path)
tokenizer.save_pretrained(adapter_path)

print(f"LoRA adapters saved to: {adapter_path}")

# Check adapter size: total bytes of every file under the adapter directory.
# (pathlib's stat() is used so this cell needs no import of its own.)
total_size = sum(f.stat().st_size for f in adapter_path.rglob("*") if f.is_file())
print(f"Adapter size: {total_size / 1e6:.2f} MB")

---
## 11. Merge LoRA with Base Model

For deployment, you can merge the LoRA adapters back into the base model.

In [None]:
# Merge LoRA adapters with base model
print("Merging LoRA adapters with base model...")

if CONFIG["use_qlora"]:
    # The training model holds 4-bit quantized base weights. Merging into those
    # means dequantize -> add adapter delta -> requantize, which loses precision
    # and leaves a quantized "merged" model. For a deployable checkpoint, reload
    # the base in bf16, attach the adapter saved in the previous section
    # (adapter_path), and merge there instead.
    full_precision_base = AutoModelForCausalLM.from_pretrained(
        CONFIG["model_name"],
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    merged_model = PeftModel.from_pretrained(
        full_precision_base,
        adapter_path,
    ).merge_and_unload()
else:
    # Standard LoRA: the base is already bf16, so merge in place.
    # NOTE: merge_and_unload() folds the adapters into `model`'s base layers, so
    # `model` should not be trained further after this cell.
    merged_model = model.merge_and_unload()

print(f"Merged model parameters: {merged_model.num_parameters():,}")

In [None]:
# Sanity check: the merged model should answer the same probe prompts.
banner = "=" * 60
print(banner)
print("MERGED MODEL TEST")
print(banner)

for idx, prompt_text in enumerate(test_prompts):
    print(f"\n--- Test {idx+1} ---")
    full_text = generate_response(merged_model, tokenizer, prompt_text)
    # Drop the echoed prompt when the output starts with it; otherwise keep everything.
    if full_text.startswith(prompt_text):
        completion = full_text[len(prompt_text):].strip()
    else:
        completion = full_text
    print(f"Response: {completion[:150]}")

In [None]:
# Persist the merged weights together with the tokenizer in one directory.
merged_path = Path(CONFIG["output_dir"]) / "merged"
merged_path.mkdir(parents=True, exist_ok=True)

for artifact in (merged_model, tokenizer):
    artifact.save_pretrained(merged_path)

print(f"Merged model saved to: {merged_path}")

---
## 12. Load LoRA Adapter (For Inference)

Demonstration of how to load a saved LoRA adapter.

In [None]:
# Inference recipe: fresh bf16 base model + the adapter directory saved earlier.
print("Demonstrating LoRA adapter loading...")

# Step 1: unmodified base weights.
base_model = AutoModelForCausalLM.from_pretrained(
    CONFIG["model_name"],
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Step 2: attach the trained adapter on top of the base.
lora_model = PeftModel.from_pretrained(base_model, adapter_path)

print("LoRA adapter loaded successfully!")

# Step 3: smoke-test with the first probe prompt.
test_prompt = test_prompts[0]
response = generate_response(lora_model, tokenizer, test_prompt)
if response.startswith(test_prompt):
    answer = response[len(test_prompt):].strip()
else:
    answer = response
print(f"\nTest response: {answer[:100]}")

---
## 13. Memory Comparison Summary

In [None]:
# Summary of the run: configuration, parameter efficiency, peak memory and saved artifacts.
# Relies on values computed in earlier cells (all_params, trainable_params,
# adapter_path, merged_path, total_size).
print("=" * 60)
print("STAGE 3 SUMMARY - LoRA/QLoRA")
print("=" * 60)

print(f"\n📊 Configuration:")
print(f"   Model: {CONFIG['model_name']}")
print(f"   QLoRA (4-bit): {CONFIG['use_qlora']}")
print(f"   LoRA rank: {CONFIG['lora_r']}")
print(f"   LoRA alpha: {CONFIG['lora_alpha']}")

print(f"\n📈 Parameter Efficiency:")
print(f"   Total parameters: {all_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")
print(f"   Trainable %: {100 * trainable_params / all_params:.2f}%")

if torch.cuda.is_available():
    # Peak since process start: this includes every model loaded so far in the
    # notebook, not only the training run.
    print(f"\n💾 Memory Usage:")
    print(f"   Peak GPU memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

print(f"\n📁 Saved Files:")
print(f"   Adapter: {adapter_path}")
print(f"   Merged: {merged_path}")
print(f"   Adapter size: {total_size / 1e6:.2f} MB")

---
## ✅ Stage 3 Complete!

### What we achieved:
- ✅ **Dramatically reduced memory usage** with QLoRA
- ✅ **Only ~2-5% of parameters trained** (LoRA adapters)
- ✅ **Similar quality** to full fine-tuning
- ✅ **Tiny adapter files** for easy deployment
- ✅ **Can merge adapters** into base model

### Key Benefits:
- 🚀 Can fine-tune larger models on smaller GPUs
- 💰 Much cheaper training
- 🔒 Base model knowledge preserved
- 📦 Easy to swap adapters for different tasks

### Next Step: Full Evaluation
Compare all three stages on the same test queries!

---