In [None]:
# Emit a marker so the reader can tell this setup cell has executed.
print("Setup complete.")

# Lab 03: Fine-Tuning Techniques & PEFT

## Learning Objectives
- Understand the difference between full fine-tuning and parameter-efficient fine-tuning (PEFT)
- Implement Low-Rank Adaptation (LoRA), a popular PEFT method
- Compare the number of trainable parameters between full fine-tuning and LoRA
- Apply LoRA to a mock model

## Setup

In [None]:
import numpy as np
from typing import List, Dict, Tuple
from dataclasses import dataclass, field

## Part 1: Full Fine-Tuning vs. PEFT

**Full Fine-Tuning** involves updating all the weights of a pre-trained model. While effective, it has significant drawbacks:
- **High Computational Cost**: Requires a lot of memory and processing power.
- **Large Storage Needs**: A full copy of the model must be saved for each task.
- **Catastrophic Forgetting**: The model may lose some of its general capabilities.

**Parameter-Efficient Fine-Tuning (PEFT)** methods update only a small subset of the model's parameters. This offers several advantages:
- **Efficiency**: Drastically reduces memory and computational requirements.
- **Small Footprint**: Only the small set of updated parameters needs to be saved.
- **Reduces Forgetting**: The original pre-trained weights are frozen, preserving general knowledge.

## Part 2: Low-Rank Adaptation (LoRA)

In [None]:
class LoRALayer:
    """Low-rank adapter wrapped around a frozen 2-D weight matrix.

    The effective weights are ``W_0 + A @ B`` where ``W_0`` is the frozen
    ``(d, k)`` matrix, ``A`` has shape ``(d, rank)`` and ``B`` has shape
    ``(rank, k)``. ``B`` starts at zero, so a freshly built layer reproduces
    ``W_0`` exactly; only ``A`` and ``B`` are meant to be trained.

    Args:
        original_weights: Pre-trained weight matrix of shape ``(d, k)``.
        rank: Inner dimension of the low-rank factors; must be at least 1.

    Raises:
        ValueError: If ``original_weights`` is not 2-D or ``rank`` is < 1.
    """

    def __init__(self, original_weights: np.ndarray, rank: int):
        if original_weights.ndim != 2:
            raise ValueError(
                "original_weights must be a 2-D matrix, got "
                f"{original_weights.ndim} dimension(s)"
            )
        if rank < 1:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        # Freeze a *view* rather than the caller's array: the layer cannot
        # write to W_0, no data is copied, and the caller's own array keeps
        # whatever writeable flag it had before.
        frozen = original_weights.view()
        frozen.setflags(write=False)
        self.original_weights = frozen
        self.rank = rank
        d, k = frozen.shape

        # LoRA's low-rank factors. A gets small random values, B is zero,
        # so the initial update A @ B is exactly zero.
        self.A = np.random.randn(d, rank) * 0.01  # (d, r)
        self.B = np.zeros((rank, k))              # (r, k)

    @property
    def combined_weights(self) -> np.ndarray:
        """Return a new ``(d, k)`` array equal to ``W_0 + A @ B``."""
        return self.original_weights + self.A @ self.B

    def trainable_parameters(self) -> int:
        """Return the number of scalar entries in ``A`` and ``B`` combined."""
        return self.A.size + self.B.size

# Build a mock weight matrix for one large linear layer.
# NOTE: 4096 x 32000 float64 values occupy roughly 1 GB of RAM.
d_model = 4096  # Dimension of the model
vocab_size = 32000  # Size of the vocabulary

original_weight_matrix = np.random.randn(d_model, vocab_size)

# Wrap the matrix in a LoRA layer with a small rank.
lora_rank = 8
lora_layer = LoRALayer(original_weight_matrix, rank=lora_rank)

# Compare how many values full fine-tuning and LoRA would each train.
full_params = original_weight_matrix.size
lora_params = lora_layer.trainable_parameters()

# The opening and closing quotes of this banner must match; a mixed
# ' ... " pair is a SyntaxError that prevents the whole cell from running.
print('--- Parameter Comparison ---')
print(f'Original Full Layer Parameters: {full_params:,}')
print(f'LoRA (r={lora_rank}) Trainable Parameters: {lora_params:,}')
print(f'Reduction Factor: {full_params / lora_params:.2f}x')
print(f'LoRA uses {lora_params / full_params:.4%} of the original parameters.')

## Part 3: Applying LoRA to a Mock Model

In [None]:
class MockLoRAModel:
    """Toy model whose only weights are a single LoRA-adapted matrix.

    Args:
        vocab_size: Number of columns of the weight matrix (one per token id).
        dim: Number of rows of the weight matrix.
        lora_rank: Rank passed to the wrapped ``LoRALayer``.
    """

    def __init__(self, vocab_size=256, dim=32, lora_rank=4):
        # The main weight matrix of the model, shape (dim, vocab_size).
        original_weights = np.random.randn(dim, vocab_size) * 0.1
        self.lora_layer = LoRALayer(original_weights, rank=lora_rank)

    def forward(self, input_tokens: List[int]) -> np.ndarray:
        """Return logits of shape ``(vocab_size,)`` for a token sequence.

        Each token id selects one column of the combined weights; the
        selected columns are averaged and the average is projected back
        through the same combined weights.

        Args:
            input_tokens: Non-empty sequence of token ids used as column
                indices into the weight matrix.

        Raises:
            ValueError: If ``input_tokens`` is empty (the mean of zero
                vectors is undefined and would yield NaN logits).
            IndexError: If a token id is outside the weight matrix.
        """
        # len() rather than truthiness so NumPy arrays are accepted too.
        if len(input_tokens) == 0:
            raise ValueError("input_tokens must contain at least one token id")

        # Materialise W_0 + A @ B once; the property recomputes on each access.
        combined_w = self.lora_layer.combined_weights
        token_ids = np.asarray(input_tokens)
        # Gather all token columns in one indexing step: (dim, n) -> (dim,).
        avg_vector = combined_w[:, token_ids].mean(axis=1)
        return avg_vector @ combined_w

def lora_fine_tune_loop(model: MockLoRAModel, dataset, tokenizer, epochs, lr):
    """Run a mock training loop that touches only the LoRA factors.

    No real gradients are computed: every epoch draws random noise shaped
    like ``A`` and ``B`` and subtracts ``lr`` times that noise in place.
    ``dataset`` and ``tokenizer`` are accepted but not used by this
    simulation. A progress line is printed after every tenth epoch.
    """
    adapter = model.lora_layer
    for epoch in range(epochs):
        # Stand-ins for the gradients backpropagation would produce;
        # A's noise is drawn before B's.
        fake_grad_a = 0.01 * np.random.randn(*adapter.A.shape)
        fake_grad_b = 0.01 * np.random.randn(*adapter.B.shape)

        adapter.A -= lr * fake_grad_a
        adapter.B -= lr * fake_grad_b

        if (epoch + 1) % 10:
            continue
        print(f'Epoch {epoch+1}/{epochs} - LoRA weights updated.')

# Build the mock model with its default sizes.
lora_model = MockLoRAModel()

# Snapshot the frozen base weights so they can be compared after training.
original_weights_before = lora_model.lora_layer.original_weights.copy()

# The mock loop ignores dataset and tokenizer, so placeholders are passed.
lora_fine_tune_loop(lora_model, [], None, epochs=50, lr=0.01)

original_weights_after = lora_model.lora_layer.original_weights

# The leading blank line must be written as an escape; a raw line break
# inside a single-quoted string literal is a SyntaxError.
print("\n--- Weight Integrity Check ---")
# "Unchanged" means bit-for-bit equal, so compare exactly rather than
# within a floating-point tolerance.
print(f'Original weights remain unchanged: {np.array_equal(original_weights_before, original_weights_after)}')

## Part 4: Merging LoRA Weights

In [None]:
def merge_lora_weights(lora_layer: LoRALayer) -> np.ndarray:
    """Return the layer's base weights with its LoRA update folded in.

    The result is whatever ``lora_layer.combined_weights`` yields, so it
    can be used for inference as a single plain weight matrix.
    """
    merged = lora_layer.combined_weights
    return merged

print("--- Merging for Inference ---")
print(f'Shape of original weights: {lora_model.lora_layer.original_weights.shape}')

# Once training is done, fold the adapter into a single matrix. That matrix
# is also what inference uses, so both names are bound to the same array:
# no separate LoRA computation is needed at inference time.
merged_weights = inference_model_weights = merge_lora_weights(lora_model.lora_layer)
print(f'Shape of merged weights: {merged_weights.shape}')

print("LoRA weights successfully merged for efficient inference.")

## Exercises

1. **Experiment with Rank**: Change the `lora_rank`. How does it affect the number of trainable parameters? What might be the trade-off between a very low rank (e.g., 1) and a higher rank (e.g., 64)?
2. **Implement LoRA for Multiple Layers**: Modify the `MockLoRAModel` to have multiple linear layers (e.g., `layer1`, `layer2`) and apply LoRA to each of them. How would you manage the different LoRA matrices?
3. **Save and Load LoRA Adapters**: Write functions to save only the LoRA matrices (`A` and `B`) to a file and then load them back into a `LoRALayer`. This demonstrates how lightweight PEFT adapters are.

## Summary

You learned:
- The key differences and trade-offs between full fine-tuning and PEFT.
- The mechanics of LoRA, a popular PEFT technique that uses low-rank matrices to adapt a model.
- How LoRA dramatically reduces the number of trainable parameters.
- How to merge LoRA weights back into the base model for efficient inference.