Support QLoRA 4-bit finetuning with bitsandbytes #275

Merged 39 commits on Aug 21, 2023. The diff below shows changes from 26 of the 39 commits.

Commits:

- abe1e29: Fix LoRA Linear calls (carmocca, Jul 13, 2023)
- a476b9b: Merge branch 'main' of https://github.com/Lightning-AI/lit-gpt (efii, Jul 13, 2023)
- c51d5cb: Merge branch 'main' of https://github.com/Lightning-AI/lit-gpt (efii, Jul 14, 2023)
- 53166bb: Merge branch 'main' of https://github.com/Lightning-AI/lit-gpt (efii, Jul 18, 2023)
- 5f5b627: Enable fine tuning with QLORA 4-bit floating point and bitsandbytes i… (efii, Jul 18, 2023)
- 2cd401a: Made quantize an optional parameter to main to pass some tests. (efii, Jul 18, 2023)
- b2d47f0: Merge github.com:Lightning-AI/lit-gpt (efii, Jul 20, 2023)
- e3ff068: Merge branch 'main' into qlora (efii, Jul 20, 2023)
- eb917c6: Merge branch 'main' into qlora (rasbt, Aug 9, 2023)
- efc13c0: move all hparam prints to main for consistency (rasbt, Aug 9, 2023)
- b75b428: fix unit tests (rasbt, Aug 9, 2023)
- b05f3a2: Merge branch 'main' into qlora (rasbt, Aug 9, 2023)
- 5b46d4c: Update lora.py (rasbt, Aug 9, 2023)
- b26b734: switch to empty_init=True (rasbt, Aug 9, 2023)
- df8d7ca: add note about qlora-style quantization (rasbt, Aug 9, 2023)
- 356a53e: add qlora test (rasbt, Aug 9, 2023)
- ae7141f: merge (rasbt, Aug 9, 2023)
- 9d9d4a5: fix merge conflict (rasbt, Aug 9, 2023)
- 71af29d: revert to empty_init=False and add adapter tests (rasbt, Aug 9, 2023)
- 96e0310: run quantize test only on gpu (rasbt, Aug 9, 2023)
- 56f49f8: use paged optimizer and update tests (rasbt, Aug 10, 2023)
- 364dcbf: incorporate carlos suggestions (rasbt, Aug 10, 2023)
- c9f55e9: Merge branch 'main' into qlora (rasbt, Aug 10, 2023)
- d7695a2: revert adapter (rasbt, Aug 10, 2023)
- 651250c: Merge branch 'qlora' of https://github.com/patrickhwood/lit-gpt into … (rasbt, Aug 10, 2023)
- faf4c18: fix merge screwup (rasbt, Aug 10, 2023)
- 626d2e7: Merge branch 'main' into qlora (carmocca, Aug 11, 2023)
- bcd351b: Minor test change to skip (carmocca, Aug 11, 2023)
- c3ca58f: Reference qlora (carmocca, Aug 11, 2023)
- 49b28a8: Update finetune/lora.py (carmocca, Aug 11, 2023)
- e6ede61: Merge branch 'main' into qlora (rasbt, Aug 14, 2023)
- 86a6255: Pin latest bnb (carmocca, Aug 15, 2023)
- 661419e: Merge branch 'main' into qlora (carmocca, Aug 15, 2023)
- a672a65: Formatting (carmocca, Aug 15, 2023)
- f98af7f: Merge branch 'main' into qlora (carmocca, Aug 15, 2023)
- 2bc66dc: fix inference script (rasbt, Aug 18, 2023)
- 78fd897: update docs (rasbt, Aug 18, 2023)
- 0a5ac5d: fmt (carmocca, Aug 21, 2023)
- 288b4ee: Merge branch 'main' into qlora (carmocca, Aug 21, 2023)

2 changes: 1 addition & 1 deletion README.md
@@ -181,7 +181,7 @@ More details about each finetuning method and how you can apply it to your own d
These technical tutorials illustrate how to run the finetuning code.

- [Finetune with Adapters](tutorials/finetune_adapter.md)
- [Finetune with LoRA](tutorials/finetune_lora.md)
- [Finetune with LoRA or QLoRA](tutorials/finetune_lora.md)

### Understanding Finetuning -- Conceptual Tutorials

2 changes: 1 addition & 1 deletion finetune/adapter_v2.py
@@ -299,4 +299,4 @@ def save_adapter_v2_checkpoint(fabric, model, file_path: Path):

from jsonargparse import CLI

CLI(setup)
CLI(setup)
29 changes: 22 additions & 7 deletions finetune/lora.py
@@ -2,7 +2,7 @@
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from typing import Optional, List, Dict, Tuple, Literal

import lightning as L
import torch
@@ -15,7 +15,7 @@
from generate.base import generate
from lit_gpt.lora import mark_only_lora_as_trainable, lora_filter, GPT, Config, Block
from lit_gpt.tokenizer import Tokenizer
from lit_gpt.utils import lazy_load, num_parameters, check_valid_checkpoint_dir, step_csv_logger, chunked_cross_entropy
from lit_gpt.utils import lazy_load, num_parameters, check_valid_checkpoint_dir, step_csv_logger, chunked_cross_entropy, quantization
from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor, measure_flops, estimate_flops
from scripts.prepare_alpaca import generate_prompt

@@ -31,7 +31,7 @@
# Hyperparameters
learning_rate = 3e-4
batch_size = 128
micro_batch_size = 4
micro_batch_size = 1
gradient_accumulation_iters = batch_size // micro_batch_size
assert gradient_accumulation_iters > 0
max_iters = 50000 # train dataset size
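
With these defaults the effective batch size stays at 128; a quick sketch of the arithmetic implied by the lines above (values copied from the diff):

```python
# Values as they appear in the diff above; micro_batch_size was changed from 4 to 1 in this PR.
batch_size = 128
micro_batch_size = 1
gradient_accumulation_iters = batch_size // micro_batch_size  # = 128 micro-batches per optimizer step
assert gradient_accumulation_iters > 0
```
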
@@ -56,11 +56,17 @@ def setup(
out_dir: Path = Path("out/lora/alpaca"),
precision: Optional[str] = None,
tpu: bool = False,
quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None,
):
if precision is None:
precision = "32-true" if tpu else "bf16-mixed"
fabric_devices = devices
if fabric_devices > 1:
if quantize:
raise NotImplementedError(
"Quantization is currently not supported for multi-GPU training. "
"Please set devices=1 when using the --quantization flag."
)
if tpu:
# For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
fabric_devices = "auto"
@@ -79,10 +85,10 @@ def setup(
logger = step_csv_logger(out_dir.parent, out_dir.name, flush_logs_every_n_steps=log_interval)
fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision, loggers=logger)
fabric.print(hparams)
fabric.launch(main, data_dir, checkpoint_dir, out_dir)
fabric.launch(main, data_dir, checkpoint_dir, out_dir, quantize)


def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path):
def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path, quantize: Optional[str] = None):
check_valid_checkpoint_dir(checkpoint_dir)

speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds")
@@ -111,7 +117,7 @@ def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path):
)
checkpoint_path = checkpoint_dir / "lit_model.pth"
fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
with fabric.init_module(empty_init=False):
with fabric.init_module(empty_init=False), quantization(quantize):
model = GPT(config)
model.apply(model._init_weights) # for the LoRA weights
with lazy_load(checkpoint_path) as checkpoint:
@@ -124,7 +130,11 @@ def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path):
fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}")
trainable_params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay)
if quantize and quantize.startswith("bnb."):
import bitsandbytes as bnb
optimizer = bnb.optim.PagedAdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay)
else:
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay)
model, optimizer = fabric.setup(model, optimizer)

fabric.seed_everything(1337 + fabric.global_rank)
@@ -226,6 +236,10 @@ def train(
checkpoint_path = out_dir / f"iter-{iter_num:06d}-ckpt.pth"
save_lora_checkpoint(fabric, model, checkpoint_path)

if fabric.device.type == "cuda":
fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)

@torch.no_grad()
def validate(
@@ -315,3 +329,4 @@ def save_lora_checkpoint(fabric, model, file_path: Path):
from jsonargparse import CLI

CLI(setup)

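The `with fabric.init_module(empty_init=False), quantization(quantize):` line above is what turns model instantiation into 4-bit instantiation. The actual `lit_gpt.utils.quantization` implementation is not part of this diff; the snippet below is only a minimal sketch of the general idea, assuming the context manager temporarily swaps `torch.nn.Linear` for bitsandbytes' `Linear4bit` and maps the `bnb.*` mode strings onto its `quant_type` and `compress_statistics` options.

```python
# Illustrative sketch only: not the lit_gpt.utils.quantization implementation.
from contextlib import contextmanager

import torch
import bitsandbytes as bnb


@contextmanager
def quantization_sketch(mode: str = "bnb.nf4"):
    # "bnb.nf4" / "bnb.fp4" select the 4-bit data type; the "-dq" suffix enables
    # double quantization of the quantization constants (as in the QLoRA paper).
    quant_type = "nf4" if "nf4" in mode else "fp4"
    double_quant = mode.endswith("-dq")
    original_linear = torch.nn.Linear

    class _Linear4bit(bnb.nn.Linear4bit):
        def __init__(self, in_features, out_features, bias=True, **_):
            super().__init__(
                in_features,
                out_features,
                bias=bias,
                quant_type=quant_type,
                compress_statistics=double_quant,
            )

    # Layers constructed through the torch.nn.Linear attribute inside the context
    # become 4-bit bitsandbytes layers; the original class is restored on exit.
    torch.nn.Linear = _Linear4bit
    try:
        yield
    finally:
        torch.nn.Linear = original_linear
```

The LoRA A and B matrices themselves stay in the compute precision and remain trainable, which is why the script pairs this with `bnb.optim.PagedAdamW` over just the trainable parameters.
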
58 changes: 58 additions & 0 deletions tests/test_lora.py
@@ -279,3 +279,61 @@ def test_lora_qkv_linear_weights_merged_status(rank, enable_lora, expected_merge
assert not layer.merged
layer.merge()
assert layer.merged == expected_merged


@pytest.mark.skipif(not torch.cuda.is_available(), reason="Quantization not supported on CPU. Skipping Test.")
def test_lora_merge_with_quantize():
from lit_gpt.lora import mark_only_lora_as_trainable, merge_lora_weights, GPT, Config
from lit_gpt.utils import quantization
import bitsandbytes as bnb

config = Config(
n_layer=1,
n_head=2,
n_embd=8,
block_size=8,
vocab_size=8,
r=8,
alpha=8,
dropout=0.1,
to_query=True,
to_value=True,
to_projection=True,
)
fabric = Fabric(devices=1, precision="bf16-mixed")
with fabric.init_module(empty_init=False), quantization("bnb.nf4"):
model = GPT(config)
model.apply(model._init_weights) # for the LoRA weights

optimizer = bnb.optim.PagedAdamW(model.parameters(), lr=1.0)
model, optimizer = fabric.setup(model, optimizer)

model.train()

initial_weight = model.transformer.h[0].attn.proj.weight.clone()
assert torch.equal(model.transformer.h[0].attn.proj.weight, initial_weight)

# perform an update to the LoRA weights
mark_only_lora_as_trainable(model)

y = model(torch.randint(0, 8, size=(2, 4), dtype=torch.int64, device=fabric.device))
y.sum().backward()
optimizer.step()
optimizer.zero_grad()
# the weight remains unchanged (only lora A and B change)
assert torch.equal(model.transformer.h[0].attn.proj.weight, initial_weight)

# calling merge() multiple times in a row should not merge multiple times
merge_lora_weights(model)
assert model.transformer.h[0].attn.attn.merged
weight_after = model.transformer.h[0].attn.proj.weight.clone()
merge_lora_weights(model)
merge_lora_weights(model)
assert torch.equal(model.transformer.h[0].attn.proj.weight, weight_after)

# check that `W_after = W_initial + (A x B)`
a = model.transformer.h[0].attn.proj.lora_A
b = model.transformer.h[0].attn.proj.lora_B
scaling = model.transformer.h[0].attn.proj.scaling
delta_w = (b @ a) * scaling
torch.testing.assert_close(weight_after, initial_weight + delta_w)
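
The final `assert_close` above verifies the merge identity that LoRA relies on. As a standalone illustration (the tensor shapes and the `scaling = alpha / r` convention are assumptions for this sketch, not taken from the diff):

```python
# Standalone sketch of the identity checked at the end of the test above:
#   W_merged = W_frozen + (B @ A) * scaling
import torch

d_out, d_in, r, alpha = 8, 8, 2, 16
W = torch.randn(d_out, d_in)      # frozen pretrained weight
A = torch.randn(r, d_in) * 0.01   # trainable low-rank factor (lora_A)
B = torch.randn(d_out, r) * 0.01  # trainable low-rank factor (lora_B)
scaling = alpha / r               # assumed scaling convention

delta_w = (B @ A) * scaling       # update with the same shape as W
W_merged = W + delta_w            # what merge_lora_weights() folds back into W
assert torch.linalg.matrix_rank(delta_w) <= r  # the update is low-rank by construction
```

Because the update is folded into the base weight, the merged layer behaves like an ordinary linear layer at inference time.
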
16 changes: 14 additions & 2 deletions tutorials/finetune_lora.md
@@ -1,4 +1,4 @@
# Finetuning with LoRA
# Finetuning with LoRA / QLoRA

[Low-rank adaptation (LoRA)](https://arxiv.org/abs/2106.09685) is a technique to approximate the update to the linear layers in an LLM with a low-rank matrix factorization. This significantly reduces the number of trainable parameters and speeds up training with little impact on the final performance of the model.
We demonstrate this method by instruction-finetuning Lit-GPT StableLM 3B on the [Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset on a **single RTX 3090 (24GB) GPU**.
@@ -38,6 +38,18 @@ This script will save checkpoints periodically to the folder `out/`.
> According to the [QLoRA](https://arxiv.org/abs/2305.14314) paper (section 4): "LoRA on all linear transformer block layers are required to match full finetuning performance".
> By default, LoRA is applied only to the `query` and `value` matrices. To apply LoRA to other weight matrices, change the variables in `finetune/lora.py` accordingly (a hypothetical `Config` sketch follows below).

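A hypothetical sketch of what that looks like, expressed with the `lit_gpt.lora.Config` fields that appear in this PR's test diff above (`r`, `alpha`, `dropout`, `to_query`, `to_value`, `to_projection`); the exact variable names exposed in `finetune/lora.py` are not shown in this diff, and the dimensions below are toy values:

```python
# Hypothetical sketch: extend LoRA beyond the default query/value matrices.
# Only Config fields seen in tests/test_lora.py are used; sizes are illustrative.
from lit_gpt.lora import Config

config = Config(
    n_layer=1,
    n_head=2,
    n_embd=8,
    block_size=8,
    vocab_size=8,
    r=8,
    alpha=16,
    dropout=0.05,
    to_query=True,       # LoRA on the query matrix (default target)
    to_value=True,       # LoRA on the value matrix (default target)
    to_projection=True,  # additionally apply LoRA to the attention projection
)
```
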
Optionally, finetuning using 4-bit quantization (as in QLoRA) can be enabled via the `--quantize` flag, for example using the 4-bit NormalFloat data type:

```bash
python finetune/lora.py --quantize "bnb.nf4"
```

and optionally with double-quantization:

```bash
python finetune/lora.py --quantize "bnb.nf4-dq"
```

## Test the model

You can test the finetuned model with your own instructions by running:
@@ -52,7 +64,7 @@ Output:
I would recommend the movie The Martian (2015). It is a sci-fi movie starring Matt Damon that follows the story of...
```

If your GPU supports `bfloat16`, you can additionally pass `--precision bf16-true` to bring the memory consumption down to ~11 GB for StableLM-3B.
If your GPU supports `bfloat16`, you can additionally pass `--precision "bf16-true"` to bring the memory consumption down to ~7.6 GB for StableLM-3B (versus ~15.2 GB for `--precision "32-true"`). In addition, you can combine this with quantization; for example, `--precision "bf16-true" --quantize "bnb.nf4"` brings the memory consumption down further to ~4.4 GB for StableLM-3B.
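
Combining both of the flags mentioned above gives the lowest-memory configuration described in this paragraph:

```bash
python finetune/lora.py --precision "bf16-true" --quantize "bnb.nf4"
```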

## Tune on your dataset
