# Implementation: LoRA Layer

**Goal**: Inject a small set of trainable low-rank (LoRA) parameters into a frozen linear layer.

In [None]:
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Linear layer with a frozen base weight and a trainable low-rank update.

    The output is ``linear(x) + (x @ A.T @ B.T) * (alpha / rank)``. Only
    ``lora_A`` and ``lora_B`` require gradients; the parameters of the wrapped
    ``nn.Linear`` are frozen.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Inner dimension of the low-rank factors. Must be positive.
        alpha: Numerator of the scaling factor ``alpha / rank`` applied to
            the low-rank branch.
        bias: Whether the frozen base layer carries a bias term.

    Raises:
        ValueError: If ``rank`` is not a positive number.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        rank: int = 8,
        alpha: float = 16,
        bias: bool = True,
    ) -> None:
        super().__init__()
        # Fail early with a clear message: rank == 0 would otherwise surface
        # as a ZeroDivisionError in the scaling computation, and a negative
        # rank as an opaque tensor-shape error.
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank!r}")

        # Frozen base layer. Iterating over its parameters also covers the
        # bias-free case, where ``self.linear.bias`` is None.
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        for base_param in self.linear.parameters():
            base_param.requires_grad_(False)

        # Trainable low-rank factors. B starts at zero so the low-rank branch
        # contributes nothing until training updates it.
        self.lora_A = nn.Parameter(torch.randn(rank, in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(out_features, rank))

        self.scaling = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the frozen projection of ``x`` plus the scaled low-rank update."""
        # Wx + (B @ A)x * scale, evaluated as two thin matmuls so the full
        # (out_features, in_features) product B @ A is never materialized.
        base_out = self.linear(x)
        lora_out = (x @ self.lora_A.T) @ self.lora_B.T

        return base_out + lora_out * self.scaling

# 1. Setup: wrap a 1024 -> 1024 projection with rank-8 adapters
layer = LoRALinear(1024, 1024, rank=8)

# 2. Check Gradients: walk the parameters once, recording each tensor's size
#    and whether it will receive gradients, then derive both totals from that.
_param_sizes = [(p.numel(), p.requires_grad) for p in layer.parameters()]
total_params = sum(size for size, _ in _param_sizes)
trainable_params = sum(size for size, is_trainable in _param_sizes if is_trainable)

# Report the absolute counts and the fraction of parameters left frozen.
print(f"Total Params: {total_params:,}")
print(f"Trainable Params: {trainable_params:,}")
print(f"Savings: {(1 - trainable_params/total_params)*100:.2f}%")

## Conclusion
With rank 8 on a 1024×1024 layer, we train only 16,384 of 1,065,984 parameters (about 1.5%); the remaining ~98.5% stay frozen.