correct implementation for DoRA
KohakuBlueleaf committed Mar 16, 2024
1 parent 1fa03b8 commit 47922c5
Showing 3 changed files with 59 additions and 12 deletions.
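Summary of the change: in all three modules below, the weight-decompose (DoRA) path previously stored a per-channel mean of the original weight and divided the merged weight by its mean again at apply time. DoRA's magnitude term is instead the column-wise L2 norm of the weight: one value per input channel (dim 1), taken over every other dim. The sketch below is illustrative only and restates what the changed lines compute; a Conv2d-style (out, in, kh, kw) weight is assumed and the tensor names are local to this example.

import torch

org_weight = torch.randn(8, 4, 3, 3)  # assumed Conv2d-style weight: (out, in, kh, kw)

# Old behaviour (removed): per-input-channel mean over all other dims.
dora_mean_dim = tuple(i for i in range(org_weight.dim()) if i != 1)  # (0, 2, 3)
old_scale = torch.mean(org_weight, dim=dora_mean_dim, keepdim=True)  # (1, 4, 1, 1)

# New behaviour (added): per-input-channel L2 norm over the same dims,
# i.e. DoRA's column-wise weight norm, used to initialise dora_scale.
new_scale = (
    torch.norm(
        org_weight.transpose(1, 0).reshape(org_weight.shape[1], -1),
        dim=1,
        keepdim=True,
    )
    .reshape(org_weight.shape[1], *[1] * (org_weight.dim() - 1))
    .transpose(1, 0)
)  # (1, 4, 1, 1); equals torch.linalg.vector_norm(org_weight, dim=(0, 2, 3), keepdim=True)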
20 changes: 16 additions & 4 deletions lycoris/modules/locon.py
@@ -109,9 +109,15 @@ def __init__(
         self.wd = weight_decompose
         if self.wd:
             org_weight: nn.Parameter = org_module.weight
-            self.dora_mean_dim = tuple(i for i in range(org_weight.dim()) if i != 1)
+            self.dora_norm_dims = org_weight.dim() - 1
             self.dora_scale = nn.Parameter(
-                torch.mean(org_weight, dim=self.dora_mean_dim, keepdim=True)
+                torch.norm(
+                    org_weight.transpose(1, 0).reshape(org_weight.shape[1], -1),
+                    dim=1,
+                    keepdim=True,
+                )
+                .reshape(org_weight.shape[1], *[1] * self.dora_norm_dims)
+                .transpose(1, 0)
             ).float()
 
         if dropout:
@@ -190,10 +196,16 @@ def make_weight(self, device=None):
         return weight * self.scalar.to(device)
 
     def apply_weight_decompose(self, weight):
-        return weight * (
-            self.dora_scale / weight.mean(dim=self.dora_mean_dim, keepdim=True)
+        weight_norm = (
+            weight.transpose(0, 1)
+            .reshape(weight.shape[1], -1)
+            .norm(dim=1, keepdim=True)
+            .reshape(weight.shape[1], *[1] * self.dora_norm_dims)
+            .transpose(0, 1)
         )
 
+        return weight * (self.dora_scale / weight_norm)
+
     def custom_state_dict(self):
         destination = {}
         if self.wd:
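A side note, not part of the commit: the rewritten apply_weight_decompose keeps each column's direction and swaps in the learned magnitude. A minimal sketch with a 2-D Linear-style weight and made-up values:

import torch

w = torch.randn(16, 8)               # stand-in for a merged weight (org + scaled delta)
dora_scale = torch.rand(1, 8) + 0.5  # stand-in for the learned per-column magnitudes

weight_norm = (
    w.transpose(0, 1)
    .reshape(w.shape[1], -1)
    .norm(dim=1, keepdim=True)
    .reshape(w.shape[1], *[1] * (w.dim() - 1))
    .transpose(0, 1)
)  # (1, 8): current per-column norms of w

decomposed = w * (dora_scale / weight_norm)

# Each column keeps its direction, but its norm is now dora_scale.
print(torch.allclose(torch.linalg.vector_norm(decomposed, dim=0), dora_scale.flatten()))  # True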
20 changes: 16 additions & 4 deletions lycoris/modules/loha.py
@@ -175,9 +175,15 @@ def __init__(
         self.wd = weight_decompose
         if self.wd:
             org_weight: nn.Parameter = org_module.weight
-            self.dora_mean_dim = tuple(i for i in range(org_weight.dim()) if i != 1)
+            self.dora_norm_dims = org_weight.dim() - 1
             self.dora_scale = nn.Parameter(
-                torch.mean(org_weight, dim=self.dora_mean_dim, keepdim=True)
+                torch.norm(
+                    org_weight.transpose(1, 0).reshape(org_weight.shape[1], -1),
+                    dim=1,
+                    keepdim=True,
+                )
+                .reshape(org_weight.shape[1], *[1] * self.dora_norm_dims)
+                .transpose(1, 0)
             ).float()
 
         self.dropout = dropout
@@ -261,10 +267,16 @@ def get_weight(self, shape):
         return weight
 
     def apply_weight_decompose(self, weight):
-        return weight * (
-            self.dora_scale / weight.mean(dim=self.dora_mean_dim, keepdim=True)
+        weight_norm = (
+            weight.transpose(0, 1)
+            .reshape(weight.shape[1], -1)
+            .norm(dim=1, keepdim=True)
+            .reshape(weight.shape[1], *[1] * self.dora_norm_dims)
+            .transpose(0, 1)
         )
 
+        return weight * (self.dora_scale / weight_norm)
+
     def custom_state_dict(self):
         destination = {}
         destination["alpha"] = self.alpha
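Worth noting (an observation, not something the commit states): because dora_scale is initialised from the base weight's own column norms, the decompose path is a no-op at initialisation, provided the freshly initialised delta is zero, which is the usual setup for these modules. A small sketch with a plain 2-D weight standing in for org_module.weight:

import torch

w0 = torch.randn(16, 8)  # stand-in for the frozen base weight

# dora_scale starts as w0's per-column norms (what the first hunk above stores)...
dora_scale = torch.linalg.vector_norm(w0, dim=0, keepdim=True)

# ...so before training, while the merged weight still equals w0, rescaling its
# columns to dora_scale changes nothing.
weight_norm = torch.linalg.vector_norm(w0, dim=0, keepdim=True)
print(torch.allclose(w0 * (dora_scale / weight_norm), w0))  # True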
31 changes: 27 additions & 4 deletions lycoris/modules/lokr.py
@@ -218,9 +218,15 @@ def __init__(
         self.wd = weight_decompose
         if self.wd:
             org_weight: nn.Parameter = org_module.weight
-            self.dora_mean_dim = tuple(i for i in range(org_weight.dim()) if i != 1)
+            self.dora_norm_dims = org_weight.dim() - 1
             self.dora_scale = nn.Parameter(
-                torch.mean(org_weight, dim=self.dora_mean_dim, keepdim=True)
+                torch.norm(
+                    org_weight.transpose(1, 0).reshape(org_weight.shape[1], -1),
+                    dim=1,
+                    keepdim=True,
+                )
+                .reshape(org_weight.shape[1], *[1] * self.dora_norm_dims)
+                .transpose(1, 0)
             ).float()
 
         self.dropout = dropout
@@ -326,10 +332,16 @@ def get_weight(self, shape):
         return weight
 
     def apply_weight_decompose(self, weight):
-        return weight * (
-            self.dora_scale / weight.mean(dim=self.dora_mean_dim, keepdim=True)
+        weight_norm = (
+            weight.transpose(0, 1)
+            .reshape(weight.shape[1], -1)
+            .norm(dim=1, keepdim=True)
+            .reshape(weight.shape[1], *[1] * self.dora_norm_dims)
+            .transpose(0, 1)
         )
 
+        return weight * (self.dora_scale / weight_norm)
+
     def custom_state_dict(self):
         destination = {}
         destination["alpha"] = self.alpha
@@ -454,6 +466,17 @@ def forward(self, x):
     test_output = lokr(test_input)
     print(test_output.shape)
 
+    # opt = torch.optim.AdamW(lokr.parameters(), lr=1e-2)
+    # for _ in range(100):
+    #     x = torch.randn(128, 128).cuda()
+    #     t = x / 10
+    #     y = lokr(x)
+    #     loss = F.mse_loss(y, t)
+    #     loss.backward()
+    #     opt.step()
+    #     opt.zero_grad()
+    #     print(loss.item())
+
     base_4bit = LinearNF4(128, 128)
     base_4bit.load_state_dict(base.state_dict())
     base_4bit.cuda()
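The commented-out block added above is a sanity-check training loop for the DoRA path. For reference, here is a rough standalone toy in the same spirit that runs outside the repository; it is not the library's API: the plain low-rank delta and all names are made up for illustration, and the norm is recomputed from the merged weight each step, as in the code above.

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
w0 = torch.randn(128, 128)                        # frozen base weight
down = nn.Parameter(torch.randn(4, 128) * 0.01)   # toy low-rank delta factors
up = nn.Parameter(torch.zeros(128, 4))
dora_scale = nn.Parameter(                        # magnitudes, init from base column norms
    torch.linalg.vector_norm(w0, dim=0, keepdim=True)
)

opt = torch.optim.AdamW([down, up, dora_scale], lr=1e-2)
for _ in range(100):
    x = torch.randn(128, 128)
    t = x / 10
    merged = w0 + up @ down                                        # W0 + delta_W
    norm = torch.linalg.vector_norm(merged, dim=0, keepdim=True)   # per-column norms
    y = F.linear(x, merged * (dora_scale / norm))                  # decomposed weight
    loss = F.mse_loss(y, t)
    loss.backward()
    opt.step()
    opt.zero_grad()
print(loss.item())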
