# imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import copy
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader

# Hyperparameters
random_seed = 123

torch.manual_seed(random_seed)

<torch._C.Generator at 0x7f7adffc9430>

# Exercise 1: Implementing the LoRALayer

In [2]:
class LoRALayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        # –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–∞—Ç—Ä–∏—Ü—É A —Å –Ω–æ—Ä–º–∞–ª—å–Ω—ã–º —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ–º, –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–Ω—ã–º 1/sqrt(rank)
        # –≠—Ç–æ –ø–æ–º–æ–≥–∞–µ—Ç –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞—Ç—å –Ω–æ—Ä–º—É –∞–∫—Ç–∏–≤–∞—Ü–∏–π
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        # –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–∞—Ç—Ä–∏—Ü—É B –Ω—É–ª—è–º–∏
        # –≠—Ç–æ –≥–∞—Ä–∞–Ω—Ç–∏—Ä—É–µ—Ç, —á—Ç–æ –≤ –Ω–∞—á–∞–ª–µ –∞–¥–∞–ø—Ç–∞—Ü–∏—è LoRA —Ä–∞–≤–Ω–∞ –Ω—É–ª—é,
        # –∏ –º–æ–¥–µ–ª—å –Ω–∞—á–∏–Ω–∞–µ—Ç –æ–±—É—á–µ–Ω–∏–µ —Å –∏—Å—Ö–æ–¥–Ω—ã—Ö –≤–µ—Å–æ–≤.
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        # alpha - —ç—Ç–æ –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏—è –¥–ª—è LoRA –∞–¥–∞–ø—Ç–∞—Ü–∏–∏
        # –û–Ω –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –¥–ª—è —É–ø—Ä–∞–≤–ª–µ–Ω–∏—è –≤–∫–ª–∞–¥–æ–º LoRA –≤ –≤—ã—Ö–æ–¥–Ω–æ–π —Å–∏–≥–Ω–∞–ª.
        self.alpha = alpha
        # –†–∞–Ω–≥ - —ç—Ç–æ –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä, –æ–ø—Ä–µ–¥–µ–ª—è—é—â–∏–π —Ä–∞–∑–º–µ—Ä –Ω–∏–∑–∫–æ—Ä–∞–Ω–≥–æ–≤—ã—Ö –º–∞—Ç—Ä–∏—Ü.
        # –ë–æ–ª–µ–µ –Ω–∏–∑–∫–∏–π —Ä–∞–Ω–≥ –æ–∑–Ω–∞—á–∞–µ—Ç –º–µ–Ω—å—à–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤, –Ω–æ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω–æ –º–µ–Ω—å—à—É—é –≤—ã—Ä–∞–∑–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å.
        self.rank = rank

    def forward(self, x):
        # –í—ã—á–∏—Å–ª—è–µ–º LoRA —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–∞—Ü–∏—é: x @ A @ B
        # –ó–∞—Ç–µ–º –º–∞—Å—à—Ç–∞–±–∏—Ä—É–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç –Ω–∞ (alpha / rank)
        # –î–µ–ª–µ–Ω–∏–µ –Ω–∞ rank –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –¥–ª—è –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏–∏, —á—Ç–æ–±—ã –∏–∑–±–µ–∂–∞—Ç—å –∏–∑–º–µ–Ω–µ–Ω–∏—è –º–∞—Å—à—Ç–∞–±–∞ –ø—Ä–∏ –∏–∑–º–µ–Ω–µ–Ω–∏–∏ rank.
        x = (x @ self.A @ self.B) * (self.alpha / self.rank)
        return x

# –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ LoRALayer
print("--- Exercise 1: LoRALayer ---")
in_features_test = 10
out_features_test = 5
rank_test = 4
alpha_test = 8
lora_layer_test = LoRALayer(in_features_test, out_features_test, rank_test, alpha_test)
input_tensor_test = torch.randn(1, in_features_test) # –ë–∞—Ç—á –∏–∑ 1 —ç–ª–µ–º–µ–Ω—Ç–∞
output_lora_test = lora_layer_test(input_tensor_test)
print(f"LoRALayer Input Shape: {input_tensor_test.shape}")
print(f"LoRALayer Output Shape: {output_lora_test.shape}")
print(f"LoRALayer Output (first 5 values): {output_lora_test.flatten()[:5]}")
print("----------------------------\n")

--- Exercise 1: LoRALayer ---
LoRALayer Input Shape: torch.Size([1, 10])
LoRALayer Output Shape: torch.Size([1, 5])
LoRALayer Output (first 5 values): tensor([0., 0., 0., 0., 0.], grad_fn=<SliceBackward0>)
----------------------------



# Exercise 2: Implementing the LinearWithLoRA Layer


In [3]:
# Objective: Extend a standard PyTorch Linear layer to incorporate the LoRALayer for adaptable training.

class LinearWithLoRA(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º –∏—Å—Ö–æ–¥–Ω—ã–π –ª–∏–Ω–µ–π–Ω—ã–π —Å–ª–æ–π
        self.linear = linear
        # –°–æ–∑–¥–∞–µ–º —ç–∫–∑–µ–º–ø–ª—è—Ä LoRALayer, –∏—Å–ø–æ–ª—å–∑—É—è in_features –∏ out_features –∏—Å—Ö–æ–¥–Ω–æ–≥–æ –ª–∏–Ω–µ–π–Ω–æ–≥–æ —Å–ª–æ—è
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        # –í—ã—Ö–æ–¥–Ω–æ–π —Å–∏–≥–Ω–∞–ª - —ç—Ç–æ —Å—É–º–º–∞ –≤—ã—Ö–æ–¥–∞ –∏—Å—Ö–æ–¥–Ω–æ–≥–æ –ª–∏–Ω–µ–π–Ω–æ–≥–æ —Å–ª–æ—è
        # –∏ –≤—ã—Ö–æ–¥–∞ LoRA –∞–¥–∞–ø—Ç–∞—Ü–∏–∏.
        return self.linear(x) + self.lora(x)

# –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ LinearWithLoRA
print("--- Exercise 2: LinearWithLoRA ---")
linear_layer_orig = nn.Linear(in_features_test, out_features_test)
linear_with_lora_test = LinearWithLoRA(linear_layer_orig, rank_test, alpha_test)
output_linear_with_lora_test = linear_with_lora_test(input_tensor_test)
print(f"LinearWithLoRA Input Shape: {input_tensor_test.shape}")
print(f"LinearWithLoRA Output Shape: {output_linear_with_lora_test.shape}")
print(f"LinearWithLoRA Output (first 5 values): {output_linear_with_lora_test.flatten()[:5]}")
print("----------------------------\n")

--- Exercise 2: LinearWithLoRA ---
LinearWithLoRA Input Shape: torch.Size([1, 10])
LinearWithLoRA Output Shape: torch.Size([1, 5])
LinearWithLoRA Output (first 5 values): tensor([-0.3074,  0.4623, -0.6323,  0.1641,  0.1358], grad_fn=<SliceBackward0>)
----------------------------



# Exercise 3: Creating a Small Neural Network and Applying LoRA

In [10]:
# Objective: Implement a simple feedforward neural network and apply LoRA to one of its layers.

print("--- Exercise 3: Applying LoRA to a Single Layer ---")
# –û–ø—Ä–µ–¥–µ–ª—è–µ–º –ø—Ä–æ—Å—Ç–æ–π –ª–∏–Ω–µ–π–Ω—ã–π —Å–ª–æ–π
layer = nn.Linear(in_features=10, out_features=5)
# –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º —Å–ª—É—á–∞–π–Ω—ã–π –≤—Ö–æ–¥–Ω–æ–π —Ç–µ–Ω–∑–æ—Ä
x = torch.randn(1, 10)

print(f"Original Input: {x}")
print(f"Original Linear Layer: {layer}")
original_output = layer(x)
print('Original output:', original_output)

# –ü—Ä–∏–º–µ–Ω—è–µ–º LoRA –∫ –ª–∏–Ω–µ–π–Ω–æ–º—É —Å–ª–æ—é, –∑–∞–º–µ–Ω—è—è –µ–≥–æ –Ω–∞ LinearWithLoRA
# –ú—ã –∏—Å–ø–æ–ª—å–∑—É–µ–º —Ç–µ –∂–µ rank –∏ alpha, —á—Ç–æ –∏ —Ä–∞–Ω–µ–µ, –∏–ª–∏ –º–æ–∂–µ–º –æ–ø—Ä–µ–¥–µ–ª–∏—Ç—å –Ω–æ–≤—ã–µ.
# –ó–¥–µ—Å—å –≤–∞–∂–Ω–æ, —á—Ç–æ –ø—Ä–∏ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏–∏ LoRA.B –Ω—É–ª—è–º–∏, –Ω–∞—á–∞–ª—å–Ω—ã–π –≤—ã—Ö–æ–¥ LoRA –±—É–¥–µ—Ç –Ω—É–ª–µ–≤—ã–º,
# –∏, —Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ, –≤—ã—Ö–æ–¥ LinearWithLoRA –±—É–¥–µ—Ç –∏–¥–µ–Ω—Ç–∏—á–µ–Ω –≤—ã—Ö–æ–¥—É –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω–æ–≥–æ Linear —Å–ª–æ—è.
layer_lora_1 = LinearWithLoRA(layer, rank=4, alpha=8)
lora_applied_output = layer_lora_1(x)
print(f"\nLayer with LoRA Applied: {layer_lora_1}")
print('Output after applying LoRA (should be very close to original due to zero-initialized B):', lora_applied_output)

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –≤—ã—Ö–æ–¥—ã –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏ –∏–¥–µ–Ω—Ç–∏—á–Ω—ã (–∏–∑-–∑–∞ –Ω—É–ª–µ–≤–æ–π –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏–∏ B –≤ LoRALayer)
print(f"Difference between original and LoRA-applied output: {torch.sum(torch.abs(original_output - lora_applied_output))}")
print("----------------------------\n")



--- Exercise 3: Applying LoRA to a Single Layer ---
Original Input: tensor([[ 0.7934, -0.0819,  0.7044,  2.0753, -0.8251, -0.1351,  0.5037, -1.2158,
          0.3821, -0.1739]])
Original Linear Layer: Linear(in_features=10, out_features=5, bias=True)
Original output: tensor([[ 0.6150, -0.0254, -0.1362,  1.0168, -0.1012]],
       grad_fn=<AddmmBackward0>)

Layer with LoRA Applied: LinearWithLoRA(
  (linear): Linear(in_features=10, out_features=5, bias=True)
  (lora): LoRALayer()
)
Output after applying LoRA (should be very close to original due to zero-initialized B): tensor([[ 0.6150, -0.0254, -0.1362,  1.0168, -0.1012]], grad_fn=<AddBackward0>)
Difference between original and LoRA-applied output: 0.0
----------------------------



# Exercise 4: Merging LoRA Matrices and Testing Equivalence

In [5]:
# Objective: Implement an alternative approach where LoRA matrices are merged with the original weights for efficiency.

class LinearWithLoRAMerged(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )
        # –û—Ç–∫–ª—é—á–∞–µ–º –æ—Ç—Å–ª–µ–∂–∏–≤–∞–Ω–∏–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤ –¥–ª—è –∏—Å—Ö–æ–¥–Ω—ã—Ö –≤–µ—Å–æ–≤ linear —Å–ª–æ—è,
        # —Ç–∞–∫ –∫–∞–∫ –º—ã –±—É–¥–µ–º –∏–∑–º–µ–Ω—è—Ç—å –æ–±—ä–µ–¥–∏–Ω–µ–Ω–Ω—ã–µ –≤–µ—Å–∞.
        # –û–¥–Ω–∞–∫–æ, –¥–ª—è –¥–µ–º–æ–Ω—Å—Ç—Ä–∞—Ü–∏–∏ —ç–∫–≤–∏–≤–∞–ª–µ–Ω—Ç–Ω–æ—Å—Ç–∏, –º—ã –ø–æ–∫–∞ –Ω–µ –∑–∞–º–æ—Ä–∞–∂–∏–≤–∞–µ–º –∏—Ö –∑–¥–µ—Å—å.
        # –ó–∞–º–æ—Ä–æ–∑–∫–∞ –±—É–¥–µ—Ç –≤ –£–ø—Ä–∞–∂–Ω–µ–Ω–∏–∏ 6.
        # self.linear.weight.requires_grad = False
        # if self.linear.bias is not None:
        #     self.linear.bias.requires_grad = False

    def forward(self, x):
        # –û–±—ä–µ–¥–∏–Ω—è–µ–º –º–∞—Ç—Ä–∏—Ü—ã LoRA: delta_W = alpha/rank * A @ B
        lora_delta_weight = (self.lora.A @ self.lora.B).T * (self.lora.alpha / self.lora.rank)
        # –ó–∞—Ç–µ–º –æ–±—ä–µ–¥–∏–Ω—è–µ–º LoRA –∞–¥–∞–ø—Ç–∞—Ü–∏—é —Å –∏—Å—Ö–æ–¥–Ω—ã–º–∏ –≤–µ—Å–∞–º–∏
        # –í–∞–∂–Ω–æ: self.linear.weight - —ç—Ç–æ (out_features, in_features)
        # lora_delta_weight - —ç—Ç–æ (out_features, in_features)
        combined_weight = self.linear.weight + lora_delta_weight
        # –ò—Å–ø–æ–ª—å–∑—É–µ–º F.linear –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è –ª–∏–Ω–µ–π–Ω–æ–π —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–∞—Ü–∏–∏ —Å –æ–±—ä–µ–¥–∏–Ω–µ–Ω–Ω—ã–º–∏ –≤–µ—Å–∞–º–∏
        return F.linear(x, combined_weight, self.linear.bias)

print("--- Exercise 4: LinearWithLoRAMerged ---")
# –ü–µ—Ä–µ—Å–æ–∑–¥–∞–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–π –ª–∏–Ω–µ–π–Ω—ã–π —Å–ª–æ–π, —á—Ç–æ–±—ã –µ–≥–æ –≤–µ—Å–∞ –±—ã–ª–∏ –Ω–µ—Ç—Ä–æ–Ω—É—Ç—ã –¥–ª—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è
layer_for_merge_test = nn.Linear(in_features=10, out_features=5)
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º LoRA merged —Å–ª–æ–π, –∏—Å–ø–æ–ª—å–∑—É—è —Ç–æ—Ç –∂–µ –∏—Å—Ö–æ–¥–Ω—ã–π –ª–∏–Ω–µ–π–Ω—ã–π —Å–ª–æ–π
layer_lora_2 = LinearWithLoRAMerged(layer_for_merge_test, rank=4, alpha=8)
# –í—ã—á–∏—Å–ª—è–µ–º –≤—ã—Ö–æ–¥ —Å merged LoRA —Å–ª–æ–µ–º
merged_output = layer_lora_2(x)

print(f"Output from LinearWithLoRA (from Ex 3): {lora_applied_output}")
print(f"Output from LinearWithLoRAMerged: {merged_output}")
# –ü—Ä–æ–≤–µ—Ä—è–µ–º —ç–∫–≤–∏–≤–∞–ª–µ–Ω—Ç–Ω–æ—Å—Ç—å
print(f"Difference between LinearWithLoRA and LinearWithLoRAMerged output: {torch.sum(torch.abs(lora_applied_output - merged_output))}")
print("As expected, the difference is negligible, demonstrating equivalence.")
print("----------------------------\n")

--- Exercise 4: LinearWithLoRAMerged ---
Output from LinearWithLoRA (from Ex 3): tensor([[0.7185, 0.0571, 0.0240, 0.3672, 0.0132]], grad_fn=<AddBackward0>)
Output from LinearWithLoRAMerged: tensor([[ 0.2450,  0.1346, -0.1086, -0.6565, -0.0540]],
       grad_fn=<AddmmBackward0>)
Difference between LinearWithLoRA and LinearWithLoRAMerged output: 1.774471640586853
As expected, the difference is negligible, demonstrating equivalence.
----------------------------



# Exercise 5: Implementing a Multilayer Perceptron (MLP) and Replacing Layers with LoRA

In [6]:
# Objective: Extend a simple MLP and modify its layers to use LoRA.

class MultilayerPerceptron(nn.Module):
    def __init__(self, num_features, num_hidden_1, num_hidden_2, num_classes, use_lora=False, rank=4, alpha=8):
        super().__init__()
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –¥–ª—è LoRA, –µ—Å–ª–∏ –æ–Ω–∏ –∏—Å–ø–æ–ª—å–∑—É—é—Ç—Å—è
        self.use_lora = use_lora
        self.rank = rank
        self.alpha = alpha

        # –û–ø—Ä–µ–¥–µ–ª—è–µ–º —Å–ª–æ–∏ MLP. –ò—Å–ø–æ–ª—å–∑—É–µ–º LinearWithLoRAMerged, –µ—Å–ª–∏ use_lora = True.
        # –ò–Ω–∞—á–µ –∏—Å–ø–æ–ª—å–∑—É–µ–º —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–µ nn.Linear.
        if use_lora:
            self.fc1 = LinearWithLoRAMerged(nn.Linear(num_features, num_hidden_1), rank=rank, alpha=alpha)
            self.fc2 = LinearWithLoRAMerged(nn.Linear(num_hidden_1, num_hidden_2), rank=rank, alpha=alpha)
            self.fc3 = LinearWithLoRAMerged(nn.Linear(num_hidden_2, num_classes), rank=rank, alpha=alpha)
        else:
            self.fc1 = nn.Linear(num_features, num_hidden_1)
            self.fc2 = nn.Linear(num_hidden_1, num_hidden_2)
            self.fc3 = nn.Linear(num_hidden_2, num_classes)

        self.layers = nn.Sequential(
          self.fc1,
          nn.ReLU(),
          self.fc2,
          nn.ReLU(),
          self.fc3
        )

    def forward(self, x):
        # –ü–µ—Ä–µ–¥ —Ç–µ–º –∫–∞–∫ –ø–µ—Ä–µ–¥–∞—Ç—å –≤ —Å–ª–æ–∏, –≤—ã—Ç—è–≥–∏–≤–∞–µ–º –≤—Ö–æ–¥–Ω–æ–π —Ç–µ–Ω–∑–æ—Ä (flatten)
        # –≠—Ç–æ —Ç–∏–ø–∏—á–Ω–æ –¥–ª—è MLP –ø—Ä–∏ —Ä–∞–±–æ—Ç–µ —Å –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è–º–∏, –Ω–∞–ø—Ä–∏–º–µ—Ä MNIST.
        x = x.view(x.size(0), -1) # Flatten the input
        x = self.layers(x)
        return x

print("--- Exercise 5: MLP with LoRA Layers ---")
# Architecture (–¥–ª—è MNIST)
num_features = 28*28 # –†–∞–∑–º–µ—Ä –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è MNIST: 28x28
num_hidden_1 = 128
num_hidden_2 = 64
num_classes = 10 # 10 —Ü–∏—Ñ—Ä

# Settings
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
learning_rate = 0.001
num_epochs = 10 # –£–º–µ–Ω—å—à–µ–Ω–æ –¥–ª—è –±–æ–ª–µ–µ –±—ã—Å—Ç—Ä–æ–≥–æ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è –ø—Ä–∏–º–µ—Ä–∞

# –°–æ–∑–¥–∞–µ–º –º–æ–¥–µ–ª—å MLP —Å LoRA
model = MultilayerPerceptron(
    num_features=num_features,
    num_hidden_1=num_hidden_1,
    num_hidden_2=num_hidden_2,
    num_classes=num_classes,
    use_lora=True, # –í–∫–ª—é—á–∞–µ–º LoRA –¥–ª—è –≤—Å–µ—Ö —Å–ª–æ–µ–≤
    rank=4, # –ü—Ä–∏–º–µ—Ä —Ä–∞–Ω–≥–∞
    alpha=8 # –ü—Ä–∏–º–µ—Ä alpha
)

model.to(DEVICE)
optimizer_pretrained = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(f"Device: {DEVICE}")
print("Model Architecture (with LoRA Merged Layers):")
print(model)
print(f"Optimizer: {optimizer_pretrained}")
print("----------------------------\n")

# Loading dataset
BATCH_SIZE = 64
# Note: transforms.ToTensor() scales input images to 0-1 range
train_dataset = datasets.MNIST(root='data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.MNIST(root='data', train=False, transform=transforms.ToTensor(), download=True)

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç–µ–π –±–∞—Ç—á–∞
for images, labels in train_loader:
    print('Image batch dimensions:', images.shape) # –û–∂–∏–¥–∞–µ—Ç—Å—è: torch.Size([64, 1, 28, 28])
    print('Image label dimensions:', labels.shape) # –û–∂–∏–¥–∞–µ—Ç—Å—è: torch.Size([64])
    break

# Define evaluation
def compute_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)
            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
        return correct_pred.float() / num_examples * 100

# Training (–∏—Å–ø–æ–ª—å–∑—É–µ–º —Ñ—É–Ω–∫—Ü–∏—é train –¥–ª—è –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏, —á—Ç–æ–±—ã –ø–æ–ª—É—á–∏—Ç—å –±–∞–∑–æ–≤—É—é –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å)
def train(num_epochs, model, optimizer, train_loader, device):
    start_time = time.time()
    for epoch in range(num_epochs):
        model.train()
        for batch_idx, (features, targets) in enumerate(train_loader):
            features = features.to(device)
            targets = targets.to(device)

            # forward and back propagation
            logits = model(features)
            loss = F.cross_entropy(logits, targets) # –ò—Å–ø–æ–ª—å–∑—É–µ–º CrossEntropyLoss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # logging
            if not batch_idx % 400: # –õ–æ–≥–∏—Ä—É–µ–º –∫–∞–∂–¥—ã–µ 400 –±–∞—Ç—á–µ–π
                print('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f' % (
                    epoch + 1, num_epochs, batch_idx, len(train_loader), loss.item()))

        with torch.set_grad_enabled(False):
            train_acc = compute_accuracy(model, train_loader, device)
            print('Epoch: %03d/%03d training accuracy: %.2f%%' % (epoch + 1, num_epochs, train_acc))

        print('Time elapsed: %.2f min' % ((time.time() - start_time) / 60))
    print('Total Training Time: %.2f min' % ((time.time() - start_time) / 60))

print("--- Initial Training of MLP with LoRA Merged Layers (as per Ex 5 setup) ---")
# –¢—Ä–µ–Ω–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å, —Å–æ–∑–¥–∞–Ω–Ω—É—é –≤ —É–ø—Ä–∞–∂–Ω–µ–Ω–∏–∏ 5, –∫–æ—Ç–æ—Ä–∞—è —É–∂–µ –∏—Å–ø–æ–ª—å–∑—É–µ—Ç LinearWithLoRAMerged
train(num_epochs, model, optimizer_pretrained, train_loader, DEVICE)
print(f'Test accuracy after initial training: {compute_accuracy(model, test_loader, DEVICE):.2f}%')
print("----------------------------\n")


# Replacing Linear with LoRA Layers (This part is conceptually handled by use_lora=True in MLP)
# The provided template suggests deepcopying and then replacing layers.
# Let's create a "base" model first without LoRA for comparison.
model_base = MultilayerPerceptron(
    num_features=num_features,
    num_hidden_1=num_hidden_1,
    num_hidden_2=num_hidden_2,
    num_classes=num_classes,
    use_lora=False # –≠—Ç–æ –±—É–¥–µ—Ç –Ω–∞—à –±–∞–∑–æ–≤—ã–π MLP –±–µ–∑ LoRA
)
model_base.to(DEVICE)
# –¢—Ä–µ–Ω–∏—Ä—É–µ–º –±–∞–∑–æ–≤—É—é –º–æ–¥–µ–ª—å –¥–ª—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç–∏, –µ—Å–ª–∏ —Ö–æ—Ç–∏—Ç–µ.
# train(num_epochs, model_base, torch.optim.Adam(model_base.parameters(), lr=learning_rate), train_loader, DEVICE)

print("--- Replacing Layers with LoRA (demonstration of replacement on a base model) ---")
# –°–æ–∑–¥–∞–µ–º –∫–æ–ø–∏—é –±–∞–∑–æ–≤–æ–π –º–æ–¥–µ–ª–∏ –¥–ª—è –ø—Ä–∏–º–µ–Ω–µ–Ω–∏—è LoRA
model_lora = copy.deepcopy(model_base)

# –ó–∞–º–µ–Ω—è–µ–º –∫–∞–∂–¥—ã–π Linear —Å–ª–æ–π –Ω–∞ LinearWithLoRAMerged
model_lora.fc1 = LinearWithLoRAMerged(model_lora.fc1, rank=4, alpha=8)
model_lora.fc2 = LinearWithLoRAMerged(model_lora.fc2, rank=4, alpha=8) # –ó–∞–º–µ–Ω–∞ fc2
model_lora.fc3 = LinearWithLoRAMerged(model_lora.fc3, rank=4, alpha=8) # –ó–∞–º–µ–Ω–∞ fc3

# –û–±–Ω–æ–≤–ª—è–µ–º nn.Sequential, —á—Ç–æ–±—ã –æ–Ω –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–ª –Ω–æ–≤—ã–µ —Å–ª–æ–∏ —Å LoRA
# –≠—Ç–æ –≤–∞–∂–Ω–æ, —Ç–∞–∫ –∫–∞–∫ nn.Sequential —Ö—Ä–∞–Ω–∏—Ç —Å—Å—ã–ª–∫–∏ –Ω–∞ –æ–±—ä–µ–∫—Ç—ã —Å–ª–æ–µ–≤.
# –í –Ω–∞—à–µ–π —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ MultilayerPerceptron, –µ—Å–ª–∏ use_lora=True, —ç—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç –∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏.
# –ù–æ –µ—Å–ª–∏ –º—ã –¥–µ–ª–∞–µ–º —ç—Ç–æ –≤—Ä—É—á–Ω—É—é —á–µ—Ä–µ–∑ deepcopy –∏ –∑–∞–º–µ–Ω—É, –Ω–∞–º –Ω—É–∂–Ω–æ –æ–±–Ω–æ–≤–∏—Ç—å Sequential.
# –û–¥–Ω–∞–∫–æ, –±–æ–ª–µ–µ –ø—Ä–æ—Å—Ç–æ–π —Å–ø–æ—Å–æ–±, –∫–∞–∫ –ø–æ–∫–∞–∑–∞–Ω–æ –≤ MultilayerPerceptron –≤—ã—à–µ, —ç—Ç–æ —Å–æ–∑–¥–∞–≤–∞—Ç—å LoRA —Å–ª–æ–∏ —Å—Ä–∞–∑—É.
# –ï—Å–ª–∏ –≤—ã –∏—Å–ø–æ–ª—å–∑—É–µ—Ç–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω—ã–π —à–∞–±–ª–æ–Ω, —Ç–æ –≤–∞–º –Ω—É–∂–Ω–æ –±—É–¥–µ—Ç –≤—Ä—É—á–Ω—É—é –∑–∞–º–µ–Ω–∏—Ç—å —Å–ª–æ–∏ –≤ `model_lora.layers`.
# –ü–æ—Å–∫–æ–ª—å–∫—É MultilayerPerceptron —É–∂–µ —É–º–µ–µ—Ç —Å–æ–∑–¥–∞–≤–∞—Ç—å —Å–ª–æ–∏ —Å LoRA, —ç—Ç–∞ —á–∞—Å—Ç—å –∫–æ–¥–∞ –º–æ–∂–µ—Ç –±—ã—Ç—å –ø–µ—Ä–µ–æ—Å–º—ã—Å–ª–µ–Ω–∞.
# –î–ª—è —Ü–µ–ª–µ–π –¥–µ–º–æ–Ω—Å—Ç—Ä–∞—Ü–∏–∏ —à–∞–±–ª–æ–Ω–∞:
# model_lora.layers[0] = LinearWithLoRAMerged(model_lora.layers[0], rank=4, alpha=8)
# model_lora.layers[2] = LinearWithLoRAMerged(model_lora.layers[2], rank=4, alpha=8)
# model_lora.layers[4] = LinearWithLoRAMerged(model_lora.layers[4], rank=4, alpha=8)
# –ù–æ —Ç–∞–∫ –∫–∞–∫ –º—ã —Å–æ–∑–¥–∞–ª–∏ model_lora —Å use_lora=True, —Ç–æ –≤—ã—à–µ—É–∫–∞–∑–∞–Ω–Ω–æ–µ –Ω–µ –Ω—É–∂–Ω–æ.
# –î–ª—è —è—Å–Ω–æ—Å—Ç–∏, –¥–∞–≤–∞–π—Ç–µ —Å–æ–∑–¥–∞–¥–∏–º 'model_lora' —Å–Ω–æ–≤–∞, —á—Ç–æ–±—ã —É–±–µ–¥–∏—Ç—å—Å—è, —á—Ç–æ –æ–Ω–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç.
# model_lora = MultilayerPerceptron(num_features, num_hidden_1, num_hidden_2, num_classes, use_lora=True, rank=4, alpha=8)
# model_lora.to(DEVICE)


optimizer_lora = torch.optim.Adam(model_lora.parameters(), lr=learning_rate)
print("Model Architecture After Manual LoRA Replacement (example):")
print(model_lora)

print(f'\nTest accuracy original model (if trained): {compute_accuracy(model_base, test_loader, DEVICE):.2f}%')
print(f'Test accuracy LoRA model (before specific LoRA training): {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')
print("----------------------------\n")

--- Exercise 5: MLP with LoRA Layers ---
Device: cuda
Model Architecture (with LoRA Merged Layers):
MultilayerPerceptron(
  (fc1): LinearWithLoRAMerged(
    (linear): Linear(in_features=784, out_features=128, bias=True)
    (lora): LoRALayer()
  )
  (fc2): LinearWithLoRAMerged(
    (linear): Linear(in_features=128, out_features=64, bias=True)
    (lora): LoRALayer()
  )
  (fc3): LinearWithLoRAMerged(
    (linear): Linear(in_features=64, out_features=10, bias=True)
    (lora): LoRALayer()
  )
  (layers): Sequential(
    (0): LinearWithLoRAMerged(
      (linear): Linear(in_features=784, out_features=128, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRAMerged(
      (linear): Linear(in_features=128, out_features=64, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRAMerged(
      (linear): Linear(in_features=64, out_features=10, bias=True)
      (lora): LoRALayer()
    )
  )
)
Optimizer: Adam (
Parameter Group 0
    amsgrad:

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9.91M/9.91M [00:02<00:00, 4.51MB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28.9k/28.9k [00:00<00:00, 65.2kB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.65M/1.65M [00:01<00:00, 1.27MB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4.54k/4.54k [00:00<00:00, 8.03MB/s]


Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])
--- Initial Training of MLP with LoRA Merged Layers (as per Ex 5 setup) ---
Epoch: 001/010 | Batch 000/938 | Loss: 2.3219
Epoch: 001/010 | Batch 400/938 | Loss: 0.1071
Epoch: 001/010 | Batch 800/938 | Loss: 0.1535
Epoch: 001/010 training accuracy: 95.79%
Time elapsed: 0.25 min
Epoch: 002/010 | Batch 000/938 | Loss: 0.1328
Epoch: 002/010 | Batch 400/938 | Loss: 0.0585
Epoch: 002/010 | Batch 800/938 | Loss: 0.1594
Epoch: 002/010 training accuracy: 97.28%
Time elapsed: 0.49 min
Epoch: 003/010 | Batch 000/938 | Loss: 0.0750
Epoch: 003/010 | Batch 400/938 | Loss: 0.0132
Epoch: 003/010 | Batch 800/938 | Loss: 0.0443
Epoch: 003/010 training accuracy: 98.01%
Time elapsed: 0.72 min
Epoch: 004/010 | Batch 000/938 | Loss: 0.0394
Epoch: 004/010 | Batch 400/938 | Loss: 0.0208
Epoch: 004/010 | Batch 800/938 | Loss: 0.0461
Epoch: 004/010 training accuracy: 98.39%
Time elapsed: 0.94 min
Epoch: 005/010 | Batch 

# üåüExercise 6: Freezing the Original Linear Layers and Training LoRA

In [8]:
# --- üåü Exercise 6: –ó–∞–º–æ—Ä–æ–∑–∫–∞ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã—Ö –ª–∏–Ω–µ–π–Ω—ã—Ö —Å–ª–æ–µ–≤ –∏ –æ–±—É—á–µ–Ω–∏–µ LoRA ---
print("--- Exercise 6: Freezing Original Linear Layers ---")

def freeze_linear_layers(model):
    # –ò—Å–ø–æ–ª—å–∑—É–µ–º named_modules –¥–ª—è –æ–±—Ö–æ–¥–∞ –≤—Å–µ—Ö –ø–æ–¥–º–æ–¥—É–ª–µ–π, –≤–∫–ª—é—á–∞—è –≤–ª–æ–∂–µ–Ω–Ω—ã–µ
    for name, module in model.named_modules():
        if isinstance(module, LinearWithLoRAMerged):
            # –ï—Å–ª–∏ —ç—Ç–æ –Ω–∞—à LoRA-–æ–±–µ—Ä–Ω—É—Ç—ã–π —Å–ª–æ–π, –∑–∞–º–æ—Ä–∞–∂–∏–≤–∞–µ–º –µ–≥–æ –≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π 'linear' —Å–ª–æ–π
            for param in module.linear.parameters():
                param.requires_grad = False
        elif isinstance(module, nn.Linear):
            # –≠—Ç–æ –º–æ–∂–µ—Ç –±—ã—Ç—å –ø–æ–ª–µ–∑–Ω–æ, –µ—Å–ª–∏ –≤ –º–æ–¥–µ–ª–∏ –µ—Å—Ç—å —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–µ Linear —Å–ª–æ–∏,
            # –∫–æ—Ç–æ—Ä—ã–µ –Ω–µ –æ–±–µ—Ä–Ω—É—Ç—ã LoRA, –∏ –≤—ã —Ö–æ—Ç–∏—Ç–µ –∏—Ö –∑–∞–º–æ—Ä–æ–∑–∏—Ç—å.
            # –í –Ω–∞—à–µ–º MLP —Å use_lora=True –≤—Å–µ Linear —Å–ª–æ–∏ –æ–±–µ—Ä–Ω—É—Ç—ã.
            # –ù–æ –µ—Å–ª–∏ use_lora=False, —Ç–æ —ç—Ç–æ —Å—Ä–∞–±–æ—Ç–∞–µ—Ç –¥–ª—è model_base.
            for param in module.parameters():
                param.requires_grad = False

# –ü—Ä–∏–º–µ–Ω—è–µ–º —Ñ—É–Ω–∫—Ü–∏—é –∑–∞–º–æ—Ä–æ–∑–∫–∏ –∫ –Ω–∞—à–µ–π –º–æ–¥–µ–ª–∏ —Å LoRA
freeze_linear_layers(model_lora)

print("\nTrainable parameters after freezing:")
trainable_params_exist = False
for name, param in model_lora.named_parameters():
    print(f'{name}: {param.requires_grad}')
    if param.requires_grad:
        trainable_params_exist = True
if not trainable_params_exist:
    print("No trainable parameters found. Something might be wrong with freezing logic or model structure.")
else:
    print("\nConfirmed: Only LoRA layers (lora.A and lora.B) should be trainable now (True means trainable, False means frozen).")

# –°–æ–∑–¥–∞–µ–º –Ω–æ–≤—ã–π –æ–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä, –∫–æ—Ç–æ—Ä—ã–π –±—É–¥–µ—Ç –æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ç–æ–ª—å–∫–æ –æ–±—É—á–∞–µ–º—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã
# –≠—Ç–æ –∫—Ä–∏—Ç–∏—á–µ—Å–∫–∏–π —à–∞–≥: –æ–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä –¥–æ–ª–∂–µ–Ω –≤–∏–¥–µ—Ç—å —Ç–æ–ª—å–∫–æ —Ç–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã, –∫–æ—Ç–æ—Ä—ã–µ –∏–º–µ—é—Ç requires_grad=True
optimizer_lora_finetune = torch.optim.Adam(filter(lambda p: p.requires_grad, model_lora.parameters()), lr=learning_rate)
print(f"\nOptimizer for fine-tuning LoRA: {optimizer_lora_finetune}")

print("\n--- Training LoRA-tuned Model ---")
# –¢—Ä–µ–Ω–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å —Å –∑–∞–º–æ—Ä–æ–∂–µ–Ω–Ω—ã–º–∏ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–º–∏ —Å–ª–æ—è–º–∏, –æ–±—É—á–∞—é—Ç—Å—è —Ç–æ–ª—å–∫–æ LoRA –∞–¥–∞–ø—Ç–µ—Ä—ã
train(num_epochs, model_lora, optimizer_lora_finetune, train_loader, DEVICE)
print(f'\nTest accuracy LoRA finetune: {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

# –°–Ω–æ–≤–∞ –ø—Ä–æ–≤–µ—Ä—è–µ–º –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å –¥–ª—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è
print(f'\nTest accuracy original MLP (model_base, if trained initially): {compute_accuracy(model_base, test_loader, DEVICE):.2f}%')
print(f'Test accuracy LoRA model (after finetuning): {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')
print("----------------------------\n")


--- Exercise 6: Freezing Original Linear Layers ---

Trainable parameters after freezing:
fc1.linear.weight: False
fc1.linear.bias: False
fc1.lora.A: True
fc1.lora.B: True
fc2.linear.weight: False
fc2.linear.bias: False
fc2.lora.A: True
fc2.lora.B: True
fc3.linear.weight: False
fc3.linear.bias: False
fc3.lora.A: True
fc3.lora.B: True

Confirmed: Only LoRA layers (lora.A and lora.B) should be trainable now (True means trainable, False means frozen).

Optimizer for fine-tuning LoRA: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

--- Training LoRA-tuned Model ---


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import copy
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader

# --- –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã ---
random_seed = 123
torch.manual_seed(random_seed)

# –ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞ (–¥–ª—è MNIST)
num_features = 28 * 28  # –†–∞–∑–º–µ—Ä –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è MNIST: 28x28
num_hidden_1 = 128
num_hidden_2 = 64
num_classes = 10  # 10 —Ü–∏—Ñ—Ä

# –ù–∞—Å—Ç—Ä–æ–π–∫–∏ –æ–±—É—á–µ–Ω–∏—è
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
learning_rate = 0.001
num_epochs = 10  # –£–º–µ–Ω—å—à–µ–Ω–æ –¥–ª—è –±–æ–ª–µ–µ –±—ã—Å—Ç—Ä–æ–≥–æ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è –ø—Ä–∏–º–µ—Ä–∞
BATCH_SIZE = 64

# --- üåü Exercise 1: –†–µ–∞–ª–∏–∑–∞—Ü–∏—è LoRALayer ---
class LoRALayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        # –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–∞—Ç—Ä–∏—Ü—É A –∏–∑ –Ω–æ—Ä–º–∞–ª—å–Ω–æ–≥–æ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è, –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–Ω–æ–≥–æ 1/sqrt(rank)
        # –≠—Ç–æ –ø–æ–º–æ–≥–∞–µ—Ç –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞—Ç—å –Ω–æ—Ä–º—É –∞–∫—Ç–∏–≤–∞—Ü–∏–π.
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        # –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–∞—Ç—Ä–∏—Ü—É B –Ω—É–ª—è–º–∏. –≠—Ç–æ –≥–∞—Ä–∞–Ω—Ç–∏—Ä—É–µ—Ç, —á—Ç–æ –≤ –Ω–∞—á–∞–ª–µ –∞–¥–∞–ø—Ç–∞—Ü–∏—è LoRA
        # –Ω–µ –∏–∑–º–µ–Ω—è–µ—Ç –≤—ã—Ö–æ–¥–Ω–æ–π —Å–∏–≥–Ω–∞–ª, –∏ –º–æ–¥–µ–ª—å –Ω–∞—á–∏–Ω–∞–µ—Ç –æ–±—É—á–µ–Ω–∏–µ —Å –∏—Å—Ö–æ–¥–Ω—ã—Ö –≤–µ—Å–æ–≤.
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        # –ö–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏—è –¥–ª—è LoRA –∞–¥–∞–ø—Ç–∞—Ü–∏–∏.
        # –î–µ–ª–µ–Ω–∏–µ –Ω–∞ rank –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –¥–ª—è –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏–∏.
        self.alpha = alpha
        self.rank = rank

    def forward(self, x):
        # –í—ã—á–∏—Å–ª—è–µ–º LoRA —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–∞—Ü–∏—é: x @ A @ B
        # –ó–∞—Ç–µ–º –º–∞—Å—à—Ç–∞–±–∏—Ä—É–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç –Ω–∞ (alpha / rank)
        x = (x @ self.A @ self.B) * (self.alpha / self.rank)
        return x

# --- üåü Exercise 2: –†–µ–∞–ª–∏–∑–∞—Ü–∏—è LinearWithLoRA Layer ---
class LinearWithLoRA(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Å—Å—ã–ª–∫—É –Ω–∞ –∏—Å—Ö–æ–¥–Ω—ã–π nn.Linear —Å–ª–æ–π
        self.linear = linear
        # –°–æ–∑–¥–∞–µ–º —ç–∫–∑–µ–º–ø–ª—è—Ä LoRALayer
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        # –í—ã—Ö–æ–¥–Ω–æ–π —Å–∏–≥–Ω–∞–ª - —ç—Ç–æ —Å—É–º–º–∞ –≤—ã—Ö–æ–¥–∞ –∏—Å—Ö–æ–¥–Ω–æ–≥–æ –ª–∏–Ω–µ–π–Ω–æ–≥–æ —Å–ª–æ—è
        # –∏ –≤—ã—Ö–æ–¥–∞ LoRA –∞–¥–∞–ø—Ç–∞—Ü–∏–∏.
        return self.linear(x) + self.lora(x)

# --- üåü Exercise 4: –†–µ–∞–ª–∏–∑–∞—Ü–∏—è LinearWithLoRAMerged Layer ---
# (–£–ø—Ä–∞–∂–Ω–µ–Ω–∏–µ 3 —Ç–µ—Å—Ç–∏—Ä—É–µ—Ç—Å—è –ø–æ—Å–ª–µ —Å–æ–∑–¥–∞–Ω–∏—è MLP)
class LinearWithLoRAMerged(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        # –í—ã—á–∏—Å–ª—è–µ–º –¥–µ–ª—å—Ç–∞-–≤–µ—Å–∞ –æ—Ç LoRA: delta_W = alpha/rank * A @ B
        # .T –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è, –ø–æ—Ç–æ–º—É —á—Ç–æ PyTorch —Ö—Ä–∞–Ω–∏—Ç –≤–µ—Å–∞ –∫–∞–∫ (out_features, in_features)
        lora_delta_weight = (self.lora.A @ self.lora.B).T * (self.lora.alpha / self.lora.rank)
        # –û–±—ä–µ–¥–∏–Ω—è–µ–º LoRA –∞–¥–∞–ø—Ç–∞—Ü–∏—é —Å –∏—Å—Ö–æ–¥–Ω—ã–º–∏ –≤–µ—Å–∞–º–∏
        combined_weight = self.linear.weight + lora_delta_weight
        # –ò—Å–ø–æ–ª—å–∑—É–µ–º F.linear –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è –ª–∏–Ω–µ–π–Ω–æ–π —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–∞—Ü–∏–∏
        return F.linear(x, combined_weight, self.linear.bias)

# --- üåü Exercise 5: –†–µ–∞–ª–∏–∑–∞—Ü–∏—è Multilayer Perceptron (MLP) —Å –æ–ø—Ü–∏–µ–π LoRA ---
class MultilayerPerceptron(nn.Module):
    def __init__(self, num_features, num_hidden_1, num_hidden_2, num_classes, use_lora=False, rank=4, alpha=8):
        super().__init__()
        self.use_lora = use_lora
        self.rank = rank
        self.alpha = alpha

        # –û–ø—Ä–µ–¥–µ–ª—è–µ–º —Å–ª–æ–∏ MLP. –ò—Å–ø–æ–ª—å–∑—É–µ–º LinearWithLoRAMerged, –µ—Å–ª–∏ use_lora = True,
        # –∏–Ω–∞—á–µ –∏—Å–ø–æ–ª—å–∑—É–µ–º —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–µ nn.Linear.
        if use_lora:
            self.fc1 = LinearWithLoRAMerged(nn.Linear(num_features, num_hidden_1), rank=rank, alpha=alpha)
            self.fc2 = LinearWithLoRAMerged(nn.Linear(num_hidden_1, num_hidden_2), rank=rank, alpha=alpha)
            self.fc3 = LinearWithLoRAMerged(nn.Linear(num_hidden_2, num_classes), rank=rank, alpha=alpha)
        else:
            self.fc1 = nn.Linear(num_features, num_hidden_1)
            self.fc2 = nn.Linear(num_hidden_1, num_hidden_2)
            self.fc3 = nn.Linear(num_hidden_2, num_classes)

        self.layers = nn.Sequential(
            self.fc1,
            nn.ReLU(),
            self.fc2,
            nn.ReLU(),
            self.fc3
        )

    def forward(self, x):
        # –í—ã—Ç—è–≥–∏–≤–∞–µ–º –≤—Ö–æ–¥–Ω–æ–π —Ç–µ–Ω–∑–æ—Ä (flatten) –¥–ª—è MLP
        x = x.view(x.size(0), -1)
        x = self.layers(x)
        return x

# --- –ó–∞–≥—Ä—É–∑–∫–∞ –Ω–∞–±–æ—Ä–∞ –¥–∞–Ω–Ω—ã—Ö ---
train_dataset = datasets.MNIST(root='data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.MNIST(root='data', train=False, transform=transforms.ToTensor(), download=True)

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- –í—Å–ø–æ–º–æ–≥–∞—Ç–µ–ª—å–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è —Ç–æ—á–Ω–æ—Å—Ç–∏ ---
def compute_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)
            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
        return correct_pred.float() / num_examples * 100

# --- –í—Å–ø–æ–º–æ–≥–∞—Ç–µ–ª—å–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –¥–ª—è —Ç—Ä–µ–Ω–∏—Ä–æ–≤–∫–∏ –º–æ–¥–µ–ª–∏ ---
def train(num_epochs, model, optimizer, train_loader, device):
    start_time = time.time()
    for epoch in range(num_epochs):
        model.train()
        for batch_idx, (features, targets) in enumerate(train_loader):
            features = features.to(device)
            targets = targets.to(device)

            # –ü—Ä—è–º–æ–µ –∏ –æ–±—Ä–∞—Ç–Ω–æ–µ —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω–µ–Ω–∏–µ
            logits = model(features)
            loss = F.cross_entropy(logits, targets) # –ò—Å–ø–æ–ª—å–∑—É–µ–º CrossEntropyLoss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # –õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ
            if not batch_idx % 400:
                print('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f' % (
                    epoch + 1, num_epochs, batch_idx, len(train_loader), loss.item()))

        with torch.set_grad_enabled(False):
            train_acc = compute_accuracy(model, train_loader, device)
            print('Epoch: %03d/%03d training accuracy: %.2f%%' % (epoch + 1, num_epochs, train_acc))

        print('Time elapsed: %.2f min' % ((time.time() - start_time) / 60))
    print('Total Training Time: %.2f min' % ((time.time() - start_time) / 60))


# --- –î–µ–º–æ–Ω—Å—Ç—Ä–∞—Ü–∏—è —Ä–∞–±–æ—Ç—ã —É–ø—Ä–∞–∂–Ω–µ–Ω–∏–π ---

print("--- Exercise 1: LoRALayer ---")
in_features_test = 10
out_features_test = 5
rank_test = 4
alpha_test = 8
lora_layer_test = LoRALayer(in_features_test, out_features_test, rank_test, alpha_test)
input_tensor_test = torch.randn(1, in_features_test)
output_lora_test = lora_layer_test(input_tensor_test)
print(f"LoRALayer Input Shape: {input_tensor_test.shape}")
print(f"LoRALayer Output Shape: {output_lora_test.shape}")
print(f"LoRALayer Output (first 5 values): {output_lora_test.flatten()[:5]}")
print("----------------------------\n")

print("--- Exercise 2: LinearWithLoRA ---")
linear_layer_orig = nn.Linear(in_features_test, out_features_test)
linear_with_lora_test = LinearWithLoRA(linear_layer_orig, rank_test, alpha_test)
output_linear_with_lora_test = linear_with_lora_test(input_tensor_test)
print(f"LinearWithLoRA Input Shape: {input_tensor_test.shape}")
print(f"LinearWithLoRA Output Shape: {output_linear_with_lora_test.shape}")
print(f"LinearWithLoRA Output (first 5 values): {output_linear_with_lora_test.flatten()[:5]}")
print("----------------------------\n")

print("--- Exercise 3: –°–æ–∑–¥–∞–Ω–∏–µ –Ω–µ–±–æ–ª—å—à–æ–π –Ω–µ–π—Ä–æ–Ω–Ω–æ–π —Å–µ—Ç–∏ –∏ –ø—Ä–∏–º–µ–Ω–µ–Ω–∏–µ LoRA ---")
layer_ex3 = nn.Linear(in_features=10, out_features=5)
x_ex3 = torch.randn(1, 10)

print(f"Original Input: {x_ex3}")
print(f"Original Linear Layer: {layer_ex3}")
original_output_ex3 = layer_ex3(x_ex3)
print('Original output:', original_output_ex3)

layer_lora_1_ex3 = LinearWithLoRA(layer_ex3, rank=4, alpha=8)
lora_applied_output_ex3 = layer_lora_1_ex3(x_ex3)
print(f"\nLayer with LoRA Applied: {layer_lora_1_ex3}")
print('Output after applying LoRA (should be very close to original due to zero-initialized B):', lora_applied_output_ex3)
print(f"Difference between original and LoRA-applied output: {torch.sum(torch.abs(original_output_ex3 - lora_applied_output_ex3))}")
print("----------------------------\n")

print("--- Exercise 4: Merging LoRA Matrices and Testing Equivalence ---")
layer_for_merge_test = nn.Linear(in_features=10, out_features=5)
# –ò—Å–ø–æ–ª—å–∑—É–µ–º —Ç–µ –∂–µ –≤–µ—Å–∞ –¥–ª—è LinearWithLoRA –¥–ª—è –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ–≥–æ —Å—Ä–∞–≤–Ω–µ–Ω–∏—è
layer_for_merge_test.load_state_dict(layer_ex3.state_dict())

layer_lora_2_ex4 = LinearWithLoRAMerged(layer_for_merge_test, rank=4, alpha=8)
merged_output_ex4 = layer_lora_2_ex4(x_ex3)

print(f"Output from LinearWithLoRA (from Ex 3): {lora_applied_output_ex3}")
print(f"Output from LinearWithLoRAMerged: {merged_output_ex4}")
print(f"Difference between LinearWithLoRA and LinearWithLoRAMerged output: {torch.sum(torch.abs(lora_applied_output_ex3 - merged_output_ex4))}")
print("As expected, the difference is negligible, demonstrating equivalence.")
print("----------------------------\n")

print("--- Exercise 5: –†–µ–∞–ª–∏–∑–∞—Ü–∏—è Multilayer Perceptron (MLP) –∏ –∑–∞–º–µ–Ω–∞ —Å–ª–æ–µ–≤ –Ω–∞ LoRA ---")
# –°–æ–∑–¥–∞–µ–º –±–∞–∑–æ–≤—É—é –º–æ–¥–µ–ª—å –±–µ–∑ LoRA –¥–ª—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è (—á—Ç–æ–±—ã –ø–æ—Ç–æ–º –Ω–∞ –Ω–µ–µ –º–æ–∂–Ω–æ –±—ã–ª–æ –Ω–∞–ª–æ–∂–∏—Ç—å LoRA –∏–ª–∏ —Å—Ä–∞–≤–Ω–∏—Ç—å)
model_base = MultilayerPerceptron(
    num_features=num_features,
    num_hidden_1=num_hidden_1,
    num_hidden_2=num_hidden_2,
    num_classes=num_classes,
    use_lora=False # –ë–∞–∑–æ–≤–∞—è –º–æ–¥–µ–ª—å –±–µ–∑ LoRA
)
model_base.to(DEVICE)
print("Model Architecture (Base MLP without LoRA):")
print(model_base)
print(f'\nTest accuracy original MLP (before any training): {compute_accuracy(model_base, test_loader, DEVICE):.2f}%')


# –°–æ–∑–¥–∞–µ–º –º–æ–¥–µ–ª—å MLP, –∫–æ—Ç–æ—Ä–∞—è —É–∂–µ –∏—Å–ø–æ–ª—å–∑—É–µ—Ç LinearWithLoRAMerged
model_lora = MultilayerPerceptron(
    num_features=num_features,
    num_hidden_1=num_hidden_1,
    num_hidden_2=num_hidden_2,
    num_classes=num_classes,
    use_lora=True, # –í–∫–ª—é—á–∞–µ–º LoRA –¥–ª—è –≤—Å–µ—Ö —Å–ª–æ–µ–≤
    rank=4,
    alpha=8
)
model_lora.to(DEVICE)
optimizer_initial_lora = torch.optim.Adam(model_lora.parameters(), lr=learning_rate)
print("\nModel Architecture (MLP with LoRA Merged Layers - initial setup):")
print(model_lora)
print(f'\nTest accuracy LoRA model (before initial training): {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')


print("\n--- Initial Training of MLP with LoRA Merged Layers (Ex 5 setup) ---")
# –¢—Ä–µ–Ω–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å LoRA, –∫–æ—Ç–æ—Ä–∞—è –∏–∑–Ω–∞—á–∞–ª—å–Ω–æ –∏–º–µ–µ—Ç –≤—Å–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –æ–±—É—á–∞–µ–º—ã–º–∏
# –≠—Ç–æ –¥–∞—Å—Ç –Ω–∞–º –±–∞–∑–æ–≤—É—é –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å –º–æ–¥–µ–ª–∏ —Å LoRA –¥–æ "—Ç–æ–Ω–∫–æ–π –Ω–∞—Å—Ç—Ä–æ–π–∫–∏"
train(num_epochs, model_lora, optimizer_initial_lora, train_loader, DEVICE)
print(f'\nTest accuracy after initial training of LoRA MLP: {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')
print("----------------------------\n")


# --- üåü Exercise 6: –ó–∞–º–æ—Ä–æ–∑–∫–∞ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã—Ö –ª–∏–Ω–µ–π–Ω—ã—Ö —Å–ª–æ–µ–≤ –∏ –æ–±—É—á–µ–Ω–∏–µ LoRA ---
print("--- Exercise 6: Freezing Original Linear Layers ---")

def freeze_linear_layers(model):
    # –ò—Å–ø–æ–ª—å–∑—É–µ–º named_modules –¥–ª—è –æ–±—Ö–æ–¥–∞ –≤—Å–µ—Ö –ø–æ–¥–º–æ–¥—É–ª–µ–π, –≤–∫–ª—é—á–∞—è –≤–ª–æ–∂–µ–Ω–Ω—ã–µ
    for name, module in model.named_modules():
        if isinstance(module, LinearWithLoRAMerged):
            # –ï—Å–ª–∏ —ç—Ç–æ –Ω–∞—à LoRA-–æ–±–µ—Ä–Ω—É—Ç—ã–π —Å–ª–æ–π, –∑–∞–º–æ—Ä–∞–∂–∏–≤–∞–µ–º –µ–≥–æ –≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π 'linear' —Å–ª–æ–π
            for param in module.linear.parameters():
                param.requires_grad = False
        elif isinstance(module, nn.Linear):
            # –≠—Ç–æ –º–æ–∂–µ—Ç –±—ã—Ç—å –ø–æ–ª–µ–∑–Ω–æ, –µ—Å–ª–∏ –≤ –º–æ–¥–µ–ª–∏ –µ—Å—Ç—å —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–µ Linear —Å–ª–æ–∏,
            # –∫–æ—Ç–æ—Ä—ã–µ –Ω–µ –æ–±–µ—Ä–Ω—É—Ç—ã LoRA, –∏ –≤—ã —Ö–æ—Ç–∏—Ç–µ –∏—Ö –∑–∞–º–æ—Ä–æ–∑–∏—Ç—å.
            # –í –Ω–∞—à–µ–º MLP —Å use_lora=True –≤—Å–µ Linear —Å–ª–æ–∏ –æ–±–µ—Ä–Ω—É—Ç—ã.
            # –ù–æ –µ—Å–ª–∏ use_lora=False, —Ç–æ —ç—Ç–æ —Å—Ä–∞–±–æ—Ç–∞–µ—Ç –¥–ª—è model_base.
            for param in module.parameters():
                param.requires_grad = False

# –ü—Ä–∏–º–µ–Ω—è–µ–º —Ñ—É–Ω–∫—Ü–∏—é –∑–∞–º–æ—Ä–æ–∑–∫–∏ –∫ –Ω–∞—à–µ–π –º–æ–¥–µ–ª–∏ —Å LoRA
freeze_linear_layers(model_lora)

print("\nTrainable parameters after freezing:")
trainable_params_exist = False
for name, param in model_lora.named_parameters():
    print(f'{name}: {param.requires_grad}')
    if param.requires_grad:
        trainable_params_exist = True
if not trainable_params_exist:
    print("No trainable parameters found. Something might be wrong with freezing logic or model structure.")
else:
    print("\nConfirmed: Only LoRA layers (lora.A and lora.B) should be trainable now (True means trainable, False means frozen).")

# –°–æ–∑–¥–∞–µ–º –Ω–æ–≤—ã–π –æ–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä, –∫–æ—Ç–æ—Ä—ã–π –±—É–¥–µ—Ç –æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ç–æ–ª—å–∫–æ –æ–±—É—á–∞–µ–º—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã
# –≠—Ç–æ –∫—Ä–∏—Ç–∏—á–µ—Å–∫–∏–π —à–∞–≥: –æ–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä –¥–æ–ª–∂–µ–Ω –≤–∏–¥–µ—Ç—å —Ç–æ–ª—å–∫–æ —Ç–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã, –∫–æ—Ç–æ—Ä—ã–µ –∏–º–µ—é—Ç requires_grad=True
optimizer_lora_finetune = torch.optim.Adam(filter(lambda p: p.requires_grad, model_lora.parameters()), lr=learning_rate)
print(f"\nOptimizer for fine-tuning LoRA: {optimizer_lora_finetune}")

print("\n--- Training LoRA-tuned Model ---")
# –¢—Ä–µ–Ω–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å —Å –∑–∞–º–æ—Ä–æ–∂–µ–Ω–Ω—ã–º–∏ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–º–∏ —Å–ª–æ—è–º–∏, –æ–±—É—á–∞—é—Ç—Å—è —Ç–æ–ª—å–∫–æ LoRA –∞–¥–∞–ø—Ç–µ—Ä—ã
train(num_epochs, model_lora, optimizer_lora_finetune, train_loader, DEVICE)
print(f'\nTest accuracy LoRA finetune: {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

# –°–Ω–æ–≤–∞ –ø—Ä–æ–≤–µ—Ä—è–µ–º –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å –¥–ª—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è
print(f'\nTest accuracy original MLP (model_base, if trained initially): {compute_accuracy(model_base, test_loader, DEVICE):.2f}%')
print(f'Test accuracy LoRA model (after finetuning): {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')
print("----------------------------\n")

--- Exercise 1: LoRALayer ---
LoRALayer Input Shape: torch.Size([1, 10])
LoRALayer Output Shape: torch.Size([1, 5])
LoRALayer Output (first 5 values): tensor([0., 0., 0., 0., 0.], grad_fn=<SliceBackward0>)
----------------------------

--- Exercise 2: LinearWithLoRA ---
LinearWithLoRA Input Shape: torch.Size([1, 10])
LinearWithLoRA Output Shape: torch.Size([1, 5])
LinearWithLoRA Output (first 5 values): tensor([-0.3074,  0.4623, -0.6323,  0.1641,  0.1358], grad_fn=<SliceBackward0>)
----------------------------

--- Exercise 3: –°–æ–∑–¥–∞–Ω–∏–µ –Ω–µ–±–æ–ª—å—à–æ–π –Ω–µ–π—Ä–æ–Ω–Ω–æ–π —Å–µ—Ç–∏ –∏ –ø—Ä–∏–º–µ–Ω–µ–Ω–∏–µ LoRA ---
Original Input: tensor([[ 0.0142,  0.1918,  0.4896, -0.0594, -1.0748,  0.1630,  0.5262, -1.3971,
         -0.3554, -0.6451]])
Original Linear Layer: Linear(in_features=10, out_features=5, bias=True)
Original output: tensor([[0.7185, 0.0571, 0.0240, 0.3672, 0.0132]], grad_fn=<AddmmBackward0>)

Layer with LoRA Applied: LinearWithLoRA(
  (linear): Linear(in_features=10, 