In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from LoRA import LoRA_Linear
from transformers import BitsAndBytesConfig
# 4-bit quantization settings: NF4 quant type, double quantization, bfloat16 compute dtype.
# NOTE(review): bnb_config is not passed to any model load in the cells shown here — confirm
# whether it is still needed.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Checkpoint identifier used to load the tokenizer below.
model_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The model itself is loaded inside apply_LoRA_tinyllama, so this eager load stays disabled.
# model = AutoModelForCausalLM.from_pretrained(model_id)

In [2]:
def get_trainable_parameters(module:nn.Module):
    """Return how many scalar elements in ``module`` are trainable.

    Walks ``module.parameters()`` and adds up ``numel()`` for every parameter
    whose ``requires_grad`` flag is set; frozen parameters contribute nothing.
    A module without trainable parameters yields 0.
    """
    return sum(
        tensor.numel()
        for tensor in module.parameters()
        if tensor.requires_grad
    )

In [3]:
def apply_LoRA_tinyllama(
    target_layers=("q_proj", "k_proj", "v_proj", "o_proj"),
    r=4,
    model_id="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    device=None,
):
    """Load a causal LM, freeze its weights, and wrap attention projections in LoRA.

    Args:
        target_layers: Names of the self-attention projections to wrap. Only
            "q_proj", "k_proj", "v_proj" and "o_proj" are recognised; any other
            name is ignored.
        r: Rank forwarded to ``LoRA_Linear`` for every wrapped projection.
        model_id: Checkpoint identifier handed to
            ``AutoModelForCausalLM.from_pretrained``.
        device: Device the model is moved to. When ``None``, CUDA is used if
            available and the CPU otherwise.

    Returns:
        The loaded model with the selected projections replaced by
        ``LoRA_Linear`` wrappers, moved to ``device``.

    Side effects:
        Prints the trainable-parameter count before freezing and after wrapping.
    """
    model = AutoModelForCausalLM.from_pretrained(model_id)
    print("Before Trainable parameters: %d" % get_trainable_parameters(model))

    # Freeze every pretrained weight first, so that whatever is trainable
    # afterwards was created by the LoRA_Linear wrappers added below.
    for param in model.parameters():
        param.requires_grad = False

    # Fixed q -> k -> v -> o order, independent of the order in target_layers.
    attention_projections = ("q_proj", "k_proj", "v_proj", "o_proj")
    selected = [name for name in attention_projections if name in target_layers]

    for layer in model.model.layers:
        attn = layer.self_attn
        for name in selected:
            # Module attribute assignment re-registers the wrapper as the submodule.
            setattr(attn, name, LoRA_Linear(getattr(attn, name), r=r))

    # Fall back to the CPU instead of crashing on machines without CUDA.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    print("After Trainable parameters: %d" % get_trainable_parameters(model))
    return model

In [4]:
# Build the LoRA-wrapped TinyLlama: rank-4 adapters on all four attention projections.
LoRA_llama = apply_LoRA_tinyllama(target_layers=["q_proj","k_proj","v_proj","o_proj"], r=4)

Before Trainable parameters: 1100048384
After Trainable parameters: 1126400


In [5]:
# Show the module tree to check that the attention projections are now LoRA_Linear wrappers.
LoRA_llama

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): LoRA_Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (B): Linear(in_features=2048, out_features=4, bias=False)
            (A): Linear(in_features=4, out_features=2048, bias=False)
          )
          (k_proj): LoRA_Linear(
            (base_layer): Linear(in_features=2048, out_features=256, bias=False)
            (B): Linear(in_features=2048, out_features=4, bias=False)
            (A): Linear(in_features=4, out_features=256, bias=False)
          )
          (v_proj): LoRA_Linear(
            (base_layer): Linear(in_features=2048, out_features=256, bias=False)
            (B): Linear(in_features=2048, out_features=4, bias=False)
            (A): Linear(in_features=4, out_features=256, bias=False)
          )
          (o_p

In [6]:
# Smoke-test batch: B sequences of L random token ids drawn from the tokenizer's vocabulary.
B,L = 4,256
vocsize = tokenizer.vocab_size
# Prefer the GPU, but fall back to the CPU so this cell does not crash without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random_inputs = torch.randint(low=0, high=vocsize, size=(B,L)).to(device)

In [7]:
# Forward pass only: gradient tracking is disabled because nothing is trained in this cell.
with torch.no_grad():
    outputs = LoRA_llama(random_inputs)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [12]:
# Sanity check on the logits shape; the recorded run shows (4, 256, 32000), matching
# B, L and the 32000-entry embedding table.
outputs.logits.shape

torch.Size([4, 256, 32000])