# Fine Tune Test 
A small test notebook for fine-tuning models, since this is my first time doing so.

In [12]:
import torch
import numpy as np
from torchtune.models.llama2 import llama2_7b, lora_llama2_7b

# peft_utils has been deprecated; make sure that the new import below is correct
# from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params
from torchtune.modules.peft import get_adapter_params, set_trainable_params



- The default settings for both models will be the same.
- We have to define which layers LoRA will be applied to.
- Therefore, in the example below, we apply LoRA to just the query and value projections in every attention module.
- LoRA can also be applied to other linear layers within the transformer.

In [13]:
# Build the base Llama2-7B model with no argument overrides.
base_model = llama2_7b()

# This cell rebinds the name `lora_llama2_7b` from the builder function to the
# model instance (later cells rely on that name). Import the builder under an
# alias so that re-running this cell still calls the builder rather than the
# already-built model.
from torchtune.models.llama2 import lora_llama2_7b as build_lora_llama2_7b

# Apply LoRA to the query and value projections.
lora_llama2_7b = build_lora_llama2_7b(lora_attn_modules=["q_proj", "v_proj"])

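The following print statement shows that each LoRA-wrapped projection gains two small linear layers, `lora_a` and `lora_b`, with shapes [original_in, lora_rank] and [lora_rank, original_out]. Their matrix product has the same shape as the original weight matrix, [original_in, original_out], while using far fewer elements (here 2 × 4096 × 8 = 65,536 instead of 4096 × 4096 = 16,777,216).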

In [14]:
# Show the first layer's attention block of the base model, then of the LoRA
# model, so the LoRA-wrapped projections can be compared with the plain ones.
print(
    base_model.layers[0].attn,
    lora_llama2_7b.layers[0].attn,
    sep="\n",
)

MultiHeadAttention(
  (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (pos_embeddings): RotaryPositionalEmbeddings()
)
MultiHeadAttention(
  (q_proj): LoRALinear(
    (dropout): Identity()
    (lora_a): Linear(in_features=4096, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=4096, bias=False)
  )
  (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (v_proj): LoRALinear(
    (dropout): Identity()
    (lora_a): Linear(in_features=4096, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=4096, bias=False)
  )
  (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (pos_embeddings): RotaryPositionalEmbeddings()
)


In [15]:
# Load the base model weights into the LoRA model.
# strict=False is required because the base state dict has no entries for the
# LoRA adapter weights (lora_a / lora_b); those are reported as missing keys.
load_result = lora_llama2_7b.load_state_dict(base_model.state_dict(), strict=False)

# strict=False would also silently accept real mismatches, so check explicitly
# that nothing unexpected was supplied and that only adapter weights are missing.
assert not load_result.unexpected_keys, (
    f"unexpected keys in base state dict: {load_result.unexpected_keys[:5]}"
)
non_lora_missing = [key for key in load_result.missing_keys if ".lora_" not in key]
assert not non_lora_missing, (
    f"base weights missing for non-LoRA parameters: {non_lora_missing[:5]}"
)

# Keep the load result as the cell's displayed value.
load_result

_IncompatibleKeys(missing_keys=['layers.0.attn.q_proj.lora_a.weight', 'layers.0.attn.q_proj.lora_b.weight', 'layers.0.attn.v_proj.lora_a.weight', 'layers.0.attn.v_proj.lora_b.weight', 'layers.1.attn.q_proj.lora_a.weight', 'layers.1.attn.q_proj.lora_b.weight', 'layers.1.attn.v_proj.lora_a.weight', 'layers.1.attn.v_proj.lora_b.weight', 'layers.2.attn.q_proj.lora_a.weight', 'layers.2.attn.q_proj.lora_b.weight', 'layers.2.attn.v_proj.lora_a.weight', 'layers.2.attn.v_proj.lora_b.weight', 'layers.3.attn.q_proj.lora_a.weight', 'layers.3.attn.q_proj.lora_b.weight', 'layers.3.attn.v_proj.lora_a.weight', 'layers.3.attn.v_proj.lora_b.weight', 'layers.4.attn.q_proj.lora_a.weight', 'layers.4.attn.q_proj.lora_b.weight', 'layers.4.attn.v_proj.lora_a.weight', 'layers.4.attn.v_proj.lora_b.weight', 'layers.5.attn.q_proj.lora_a.weight', 'layers.5.attn.q_proj.lora_b.weight', 'layers.5.attn.v_proj.lora_a.weight', 'layers.5.attn.v_proj.lora_b.weight', 'layers.6.attn.q_proj.lora_a.weight', 'layers.6.attn.q_p

In [16]:
# Collect the LoRA adapter parameters and pass them to set_trainable_params so
# that they are the parameters marked as trainable on the model.
lora_params = get_adapter_params(lora_llama2_7b)

set_trainable_params(lora_llama2_7b, lora_params)

# numel() returns the number of elements in a tensor; summing it over the
# model's parameters gives the parameter count.
total_params = sum(p.numel() for p in lora_llama2_7b.parameters())
trainable_params = sum(p.numel() for p in lora_llama2_7b.parameters() if p.requires_grad)

# The stray double quote that used to follow {trainable_params} has been removed.
print(
  f"""
  {total_params} total params,
  {trainable_params} trainable params,
  {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.
  """
)


  6742609920 total params,
  4194304" trainable params,
  0.06% of all params are trainable.
  
