Lightning-AI · lantiga · Apr 11, 2023 · Apr 10, 2023
@@ -55,8 +55,10 @@ def main():
 
     checkpoint = torch.load("checkpoints/lit-llama/7B/state_dict.pth")
 
-    with EmptyInitOnDevice(device=fabric.device, dtype=torch.bfloat16):
-        model = LLaMA(config)
+    with fabric.device:
+        torch.set_default_tensor_type(torch.HalfTensor)
+        model = LLaMA(config).bfloat16()
+        torch.set_default_tensor_type(torch.FloatTensor)
         # strict=False because missing keys due to adapter weights not contained in state dict
         model.load_state_dict(checkpoint, strict=False)
 

@@ -52,14 +52,15 @@ def main():
     config = LLaMAConfig.from_name("7B")
     config.block_size = block_size
 
-    with EmptyInitOnDevice(device=fabric.device, dtype=torch.bfloat16):
-        with lora(r=lora_r, alpha=lora_alpha, dropout=lora_dropout, enabled=True):
-            model = LLaMA(config)
-
     checkpoint = torch.load("checkpoints/lit-llama/7B/state_dict.pth")
+
+    with fabric.device, lora(r=lora_r, alpha=lora_alpha, dropout=lora_dropout, enabled=True):
+        torch.set_default_tensor_type(torch.HalfTensor)
+        model = LLaMA(config).bfloat16()
+        torch.set_default_tensor_type(torch.FloatTensor)
+        # strict=False because missing keys due to LoRA weights not contained in checkpoint state
+        model.load_state_dict(checkpoint, strict=False) 
 
-    # strict=False because missing keys due to LoRA weights not contained in checkpoint state
-    model.load_state_dict(checkpoint, strict=False) 
     mark_only_lora_as_trainable(model)
 
     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@@ -93,7 +93,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             ak = ak.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2)
             av = av.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2)
 
-            amask = torch.ones(q.shape[-2], ak.shape[-2], dtype=torch.bool)
+            amask = torch.ones(q.shape[-2], ak.shape[-2], dtype=torch.bool, device=x.device)
             ay = F.scaled_dot_product_attention(q, ak, av, attn_mask=amask, dropout_p=0.0, is_causal=False)
             y = y + self.gating_factor * ay