In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import bitsandbytes as bnb
import bitsandbytes.functional as bnbF

from peft_pretraining.modeling_llama import LlamaForCausalLM
from peft_pretraining.relora import ReLoRaModel

from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifiers for the tokenizer and the local checkpoint, named so they are easy to swap.
TOKENIZER_NAME = "t5-base"
CHECKPOINT_PATH = "../checkpoints/llama_250m-2023-06-09-11-29-56_up_to_5K/model_5000"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# No quantization flags are passed; the `load_in_8bit=True` / `load_in_4bit=True` variants are disabled here.
orig_model = LlamaForCausalLM.from_pretrained(CHECKPOINT_PATH)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# Tokenize a short probe sentence and keep the ids as PyTorch tensors.
input_ids = tokenizer("Why am I doing this?", return_tensors="pt").input_ids
# Reference forward pass of the unwrapped model. Later cells read `orig_out.logits`,
# so this must run on a fresh kernel; it is placed before `orig_model` is handed to
# ReLoRaModel.
# NOTE(review): the wrapper may modify `orig_model` in place — confirm, and keep this
# call ahead of the wrapping cell if so.
orig_out = orig_model(input_ids=input_ids)

In [4]:
# ReLoRA wrapper settings gathered in one mapping, then unpacked into the constructor.
relora_kwargs = dict(
    r=128,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["attn", "attention", "mlp"],
    trainable_scaling=False,
    keep_original_weights=True,
    quantize4bit=True,
    use_double_quant=True,
)
model = ReLoRaModel(orig_model, **relora_kwargs)
# Cast to bfloat16 and move to the GPU in a single call.
model = model.to(device="cuda", dtype=torch.bfloat16)

In [5]:
# Only parameters that still require gradients are handed to the optimizer.
trainable_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.Adam(params=trainable_parameters, lr=1e-3)

In [6]:
# Move the token ids to the GPU, where the model was placed earlier.
input_ids = input_ids.to("cuda")
# The inputs double as labels; the resulting `.loss` is read in the next cell.
quantized_out = model(input_ids, labels=input_ids)

In [7]:
# Take the loss produced by the forward pass and backpropagate it.
loss = quantized_out.loss
loss.backward()

In [8]:
# Apply one Adam update to the parameters collected in `trainable_parameters`.
optimizer.step()

In [13]:
# Grab the q_proj weight parameter of the first decoder layer for inspection.
# NOTE(review): the name `weight` is rebound to a bnb.nn.Linear4bit module in the debug section below.
weight = model.wrapped_model.model.layers[0].self_attn.q_proj.weight

In [14]:
# Convert the packed 4-bit storage back to floating point, using the quant state kept on the parameter.
weight_data_fp = bnbF.dequantize_4bit(weight.data, quant_state=weight.quant_state)

In [15]:
# Display the dequantized matrix.
weight_data_fp

tensor([[ 0.0106,  0.0106,  0.0656,  ..., -0.0438, -0.0330,  0.0354],
        [ 0.0519,  0.0317,  0.0404,  ..., -0.0113,  0.0270, -0.0056],
        [-0.0621,  0.0349, -0.0326,  ...,  0.0363,  0.0104,  0.0218],
        ...,
        [ 0.0286, -0.0145, -0.0267,  ...,  0.0047,  0.0199, -0.0309],
        [-0.0207, -0.0048,  0.0231,  ...,  0.0368,  0.0368, -0.0186],
        [-0.0327, -0.0246, -0.0057,  ..., -0.0520,  0.0293,  0.0000]],
       device='cuda:0', dtype=torch.float16)

In [9]:
# Display the raw stored parameter of the first layer's q_proj (the output below shows uint8 values).
model.wrapped_model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
Parameter(Params4bit([[153],
            [247],
            [114],
            ...,
            [198],
            [ 48],
            [215]], device='cuda:0', dtype=torch.uint8))

In [12]:
# Print the module tree of the wrapped model.
model

ReLoRaModel(
  (wrapped_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 768, padding_idx=31999)
      (layers): ModuleList(
        (0-23): 24 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): ReLoRaLinear(
              (lora_dropout): Dropout(p=0.1, inplace=False)
              (lora_A): Linear(in_features=768, out_features=128, bias=False)
              (lora_B): Linear(in_features=128, out_features=768, bias=False)
            )
            (k_proj): ReLoRaLinear(
              (lora_dropout): Dropout(p=0.1, inplace=False)
              (lora_A): Linear(in_features=768, out_features=128, bias=False)
              (lora_B): Linear(in_features=128, out_features=768, bias=False)
            )
            (v_proj): ReLoRaLinear(
              (lora_dropout): Dropout(p=0.1, inplace=False)
              (lora_A): Linear(in_features=768, out_features=128, bias=False)
              (lora_B): Linear(in_features=128, o

In [7]:
# Display the logits of the reference forward pass; requires `orig_out` to exist in the kernel.
orig_out.logits

tensor([[[-4.1537,  1.7079,  0.5386,  ..., -3.2834, -3.8020, -4.5960],
         [-3.6189,  3.0790,  1.9448,  ..., -3.3880, -4.8104, -4.4347],
         [-3.0543,  2.4709,  0.8437,  ..., -3.1641, -4.6998, -4.3173],
         ...,
         [-4.4516,  1.1609,  1.1042,  ..., -5.4319, -4.2324, -5.0842],
         [-4.5135,  5.6720,  2.0341,  ..., -1.5236, -4.5093, -4.6042],
         [-4.4559,  0.2325,  2.2894,  ..., -2.6422, -5.2159, -4.2815]]],
       grad_fn=<UnsafeViewBackward0>)

In [19]:
# L2 distance between the logits of `hf4bit_out` and the reference logits.
# NOTE(review): `hf4bit_out` is not assigned anywhere in the visible notebook — this cell
# relies on leftover kernel state and will raise NameError on a fresh run.
torch.dist(hf4bit_out.logits, orig_out.logits.cuda(), p=2)

tensor(15.6308, device='cuda:0', grad_fn=<DistBackward0>)

In [8]:
# L2 distance between the reference logits and the ReLoRA 4-bit model's logits, compared on CPU.
torch.dist(orig_out.logits.cpu(), quantized_out.logits.cpu(), p=2)

tensor(83.9014, grad_fn=<DistBackward0>)

### Debug/bnb

In [11]:
# Minimal reproduction: a bitsandbytes 4-bit linear layer plus a rank-1 LoRA branch.
in_features = 128
out_features = 64
use_double_quant = False

# NOTE: despite its name, `weight` is the whole Linear4bit module; the next cell calls it.
weight = bnb.nn.Linear4bit(
    in_features,
    out_features,
    bias=False,
    compute_dtype=torch.bfloat16,
    compress_statistics=use_double_quant,
    quant_type="nf4",
)
# One bias entry per output feature. `torch.tensor(out_features, ...)` would instead build
# a 0-dim tensor holding the value 64, which is not a bias vector of length `out_features`.
bias = torch.zeros(out_features, dtype=torch.bfloat16, requires_grad=True, device="cuda")
weight = weight.to("cuda")

# Rank-1 low-rank pair (in_features -> 1 -> out_features), kept in bfloat16 on the GPU.
lora_A = nn.Linear(in_features, 1, bias=False).to("cuda", dtype=torch.bfloat16)
lora_B = nn.Linear(1, out_features, bias=False).to("cuda", dtype=torch.bfloat16)

In [13]:
# Random bfloat16 batch with two rows.
x = torch.randn(2, in_features, device="cuda", dtype=torch.bfloat16)
# 4-bit layer output plus bias first, then the low-rank branch added on top.
y = (weight(x) + bias) + lora_B(lora_A(x))

# Reduce to a scalar so that backward() can be called on it.
loss = y.sum()
loss.backward()

In [16]:
# Random matrix that is wrapped as a Params4bit further down.
# NOTE(review): the shape is (in_features, out_features), the transpose of nn.Linear's
# (out_features, in_features) weight layout — confirm this is intended.
orig_weight = torch.randn(in_features, out_features)

In [19]:
# Small two-layer MLP; its first layer's weight is replaced in the next cell.
layers = [
    nn.Linear(in_features, out_features),
    nn.ReLU(),
    nn.Linear(out_features, out_features),
]
net = nn.Sequential(*layers)

In [20]:
# Swap the first layer's weight for a Params4bit wrapper built with the library defaults.
# NOTE(review): unlike the explicit Params4bit call in the following cell, no quant_type /
# requires_grad / compress_statistics is given here — confirm the defaults are intended.
net[0].weight = bnb.nn.Params4bit(net[0].weight.data)

In [17]:
# Wrap the random matrix as a non-trainable NF4 parameter with statistics compression off.
quantized_weight = bnb.nn.Params4bit(
    orig_weight.data,
    requires_grad=False,
    compress_statistics=False,
    quant_type="nf4",
)

In [18]:
# Inspect the wrapped parameter; the values printed below are still floating point.
# NOTE(review): presumably packing to 4-bit only happens once the parameter is moved to a CUDA device — confirm.
quantized_weight

Parameter containing:
Parameter(Params4bit([[ 0.5485, -0.2513,  0.2402,  ..., -0.7881, -0.4519, -1.0543],
            [-0.3215, -0.1178, -0.0623,  ..., -0.2657, -0.2037,  3.4480],
            [ 1.4118, -1.0065,  1.5193,  ..., -1.7599,  1.3230, -1.3040],
            ...,
            [ 1.5272,  1.4868,  0.7169,  ..., -0.0711, -0.4521,  0.9336],
            [-0.0707, -1.3644,  1.0509,  ...,  0.7394, -1.6139, -0.9520],
            [ 1.7725, -1.4115,  1.2637,  ...,  0.4864,  1.9556, -0.5330]]))