In [None]:
# checkpoints/llama_130m-2023-05-16-18-18-14

import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append('../')

from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
from peft_pretraining.relora import ReLoRaModel

In [None]:
# List the contents of the model_5000 checkpoint directory that the next cell loads from.
!ls ../checkpoints/llama_130m-2023-05-14-19-54-05/model_5000

In [None]:
# Checkpoints loaded as plain LlamaForCausalLM models; both come from the same run
# directory (model_5000 and model_20000). Their difference is plotted later as the
# "full models" / "Full-rank training" delta.
model_5K = LlamaForCausalLM.from_pretrained("../checkpoints/llama_130m-2023-05-14-19-54-05/model_5000")
full_model_20K = LlamaForCausalLM.from_pretrained("../checkpoints/llama_130m-2023-05-14-19-54-05/model_20000")

# Checkpoints loaded through the ReLoRaModel wrapper; later cells read lora_A / lora_B /
# scaling from their layers. model_20K is plotted as "ReLoRA", full_peft as "LoRA".
model_20K = ReLoRaModel.from_pretrained("../checkpoints/llama_130m-2023-05-16-18-18-14/model_20000")
full_peft = ReLoRaModel.from_pretrained("../checkpoints/llama_130m-2023-05-09-19-17-11/model_10000")

In [None]:
import torch
from tqdm import tqdm

# Singular values of every attention / MLP projection weight in model_5K, one NumPy
# array per layer, collected into one list per projection type.

# Use the GPU when one is available, but fall back to the CPU instead of crashing:
# the other cells of this notebook already run their decompositions on the CPU.
svd_device = "cuda" if torch.cuda.is_available() else "cpu"

q_projs, k_projs, v_projs, o_projs = [], [], [], []
gate_projs, down_projs, up_projs = [], [], []

for layer in tqdm(model_5K.model.layers):
    attn, mlp = layer.self_attn, layer.mlp
    # Pair each projection module with the list that collects its per-layer spectrum.
    for module, spectra in (
        (attn.q_proj, q_projs),
        (attn.k_proj, k_projs),
        (attn.v_proj, v_projs),
        (attn.o_proj, o_projs),
        (mlp.gate_proj, gate_projs),
        (mlp.down_proj, down_projs),
        (mlp.up_proj, up_projs),
    ):
        weight = module.weight.detach().to(svd_device)
        # Only the spectrum is needed, so ask for the singular values alone;
        # the deprecated torch.svd also computed U and V just to discard them.
        spectra.append(torch.linalg.svdvals(weight).cpu().numpy())

In [None]:
import torch
from tqdm import tqdm

def get_linear_weight_from_relora(relora_layer):
    """Return the merged weight of a ReLoRA layer.

    The result is ``relora_layer.weight`` plus the low-rank product
    ``lora_B.weight @ lora_A.weight`` multiplied by ``relora_layer.scaling``.

    Args:
        relora_layer: object exposing ``weight``, ``lora_A.weight``,
            ``lora_B.weight`` and ``scaling``.

    Returns:
        The merged weight tensor. It is still attached to the autograd graph of
        its inputs; callers detach it themselves.
    """
    low_rank_update = relora_layer.lora_B.weight @ relora_layer.lora_A.weight
    scaled_update = low_rank_update * relora_layer.scaling
    return relora_layer.weight + scaled_update

# Singular values of the merged weight returned by get_linear_weight_from_relora for
# every projection of every layer in model_20K; one tensor per layer, one list per
# projection type.
peft_q_projs, peft_k_projs, peft_v_projs, peft_o_projs = [], [], [], []
peft_gate_projs, peft_down_projs, peft_up_projs = [], [], []

for layer in tqdm(model_20K.wrapped_model.model.layers):
    attn, mlp = layer.self_attn, layer.mlp
    # Pair each projection module with the list that collects its per-layer spectrum.
    for proj, spectra in (
        (attn.q_proj, peft_q_projs),
        (attn.k_proj, peft_k_projs),
        (attn.v_proj, peft_v_projs),
        (attn.o_proj, peft_o_projs),
        (mlp.gate_proj, peft_gate_projs),
        (mlp.down_proj, peft_down_projs),
        (mlp.up_proj, peft_up_projs),
    ):
        merged_weight = get_linear_weight_from_relora(proj).detach()
        # Only the spectrum is needed; torch.linalg.svdvals skips the U / V factors
        # that the deprecated torch.svd computed and threw away.
        spectra.append(torch.linalg.svdvals(merged_weight))

In [None]:
# now, deltas: singular values of (merged model_20K weight - model_5K weight) for every
# projection of every layer.
# get_linear_weight_from_relora must be the merged-weight helper here
# (base weight + scaled lora_B @ lora_A); a delta-only helper would give wrong results.
q_projs_delta, k_projs_delta, v_projs_delta, o_projs_delta = [], [], [], []
gate_projs_delta, down_projs_delta, up_projs_delta = [], [], []

layers_5k = model_5K.model.layers
layers_20k = model_20K.wrapped_model.model.layers
# zip() silently stops at the shorter sequence, which would hide a depth mismatch.
assert len(layers_5k) == len(layers_20k), "model_5K and model_20K have different layer counts"

for _5k, _20k in zip(layers_5k, layers_20k):
    # (projection in model_5K, same projection in model_20K, list collecting the spectrum)
    for proj_5k, proj_20k, spectra in (
        (_5k.self_attn.q_proj, _20k.self_attn.q_proj, q_projs_delta),
        (_5k.self_attn.k_proj, _20k.self_attn.k_proj, k_projs_delta),
        (_5k.self_attn.v_proj, _20k.self_attn.v_proj, v_projs_delta),
        (_5k.self_attn.o_proj, _20k.self_attn.o_proj, o_projs_delta),
        (_5k.mlp.gate_proj, _20k.mlp.gate_proj, gate_projs_delta),
        (_5k.mlp.down_proj, _20k.mlp.down_proj, down_projs_delta),
        (_5k.mlp.up_proj, _20k.mlp.up_proj, up_projs_delta),
    ):
        weight_delta = get_linear_weight_from_relora(proj_20k).detach() - proj_5k.weight.detach()
        # Only the spectrum is needed; torch.linalg.svdvals avoids computing U and V,
        # unlike the deprecated torch.svd.
        spectra.append(torch.linalg.svdvals(weight_delta))

In [None]:
import torch
from tqdm import tqdm

def get_lora_delta_from_relora(relora_layer):
    """Return only the low-rank update of a layer: ``(lora_B @ lora_A) * scaling``.

    Unlike ``get_linear_weight_from_relora`` (defined in an earlier cell), the base
    ``relora_layer.weight`` is NOT added. The helper deliberately has its own name:
    redefining ``get_linear_weight_from_relora`` here would silently replace the
    merged-weight helper that the delta cell relies on whenever cells are re-run
    out of order.

    Args:
        relora_layer: object exposing ``lora_A.weight``, ``lora_B.weight`` and
            ``scaling``.

    Returns:
        The scaled low-rank product, still attached to the autograd graph.
    """
    return relora_layer.lora_B.weight @ relora_layer.lora_A.weight * relora_layer.scaling

# Singular values of the low-rank update alone for every projection of every layer in
# full_peft; one tensor per layer, one list per projection type.
full_peft_q_projs, full_peft_k_projs, full_peft_v_projs, full_peft_o_projs = [], [], [], []
full_peft_gate_projs, full_peft_down_projs, full_peft_up_projs = [], [], []

for layer in tqdm(full_peft.wrapped_model.model.layers):
    attn, mlp = layer.self_attn, layer.mlp
    # Pair each projection module with the list that collects its per-layer spectrum.
    for proj, spectra in (
        (attn.q_proj, full_peft_q_projs),
        (attn.k_proj, full_peft_k_projs),
        (attn.v_proj, full_peft_v_projs),
        (attn.o_proj, full_peft_o_projs),
        (mlp.gate_proj, full_peft_gate_projs),
        (mlp.down_proj, full_peft_down_projs),
        (mlp.up_proj, full_peft_up_projs),
    ):
        lora_delta = get_lora_delta_from_relora(proj).detach()
        # Only the spectrum is needed; torch.linalg.svdvals avoids computing U and V,
        # unlike the deprecated torch.svd.
        spectra.append(torch.linalg.svdvals(lora_delta))

In [None]:
# delta between full_model_20K and model_5K: singular values of
# (full_model_20K weight - model_5K weight) for every projection of every layer.
q_projs_delta_full, k_projs_delta_full, v_projs_delta_full, o_projs_delta_full = [], [], [], []
gate_projs_delta_full, down_projs_delta_full, up_projs_delta_full = [], [], []

full_layers_20k = full_model_20K.model.layers
full_layers_5k = model_5K.model.layers
# zip() silently stops at the shorter sequence, which would hide a depth mismatch.
assert len(full_layers_20k) == len(full_layers_5k), "full_model_20K and model_5K have different layer counts"

for layer_20k, layer_5k in zip(full_layers_20k, full_layers_5k):
    # (projection at 20K, same projection at 5K, list collecting the spectrum)
    for proj_20k, proj_5k, spectra in (
        (layer_20k.self_attn.q_proj, layer_5k.self_attn.q_proj, q_projs_delta_full),
        (layer_20k.self_attn.k_proj, layer_5k.self_attn.k_proj, k_projs_delta_full),
        (layer_20k.self_attn.v_proj, layer_5k.self_attn.v_proj, v_projs_delta_full),
        (layer_20k.self_attn.o_proj, layer_5k.self_attn.o_proj, o_projs_delta_full),
        (layer_20k.mlp.gate_proj, layer_5k.mlp.gate_proj, gate_projs_delta_full),
        (layer_20k.mlp.down_proj, layer_5k.mlp.down_proj, down_projs_delta_full),
        (layer_20k.mlp.up_proj, layer_5k.mlp.up_proj, up_projs_delta_full),
    ):
        weight_delta = proj_20k.weight.detach() - proj_5k.weight.detach()
        # Only the spectrum is needed; torch.linalg.svdvals avoids computing U and V,
        # unlike the deprecated torch.svd.
        spectra.append(torch.linalg.svdvals(weight_delta))


In [None]:
# plot histogram of singular values for q_projs over layers
from matplotlib import pyplot as plt

# Overlaid, density-normalised histograms of the Q-projection singular values
# for the three weight deltas computed in earlier cells.
fig, ax = plt.subplots(1, 1, figsize=(4, 4), dpi=200)
ax.set(
    title="Singular Values of Q Projections",
    xlabel="Singular Value",
    ylabel="Frequency",
)

# Flatten the per-layer spectra into one array per curve.
relora_sv = torch.cat(q_projs_delta).numpy()
lora_sv = torch.cat(full_peft_q_projs).numpy()
full_sv = torch.cat(q_projs_delta_full).numpy()

ax.hist(relora_sv, bins=100, density=True, alpha=0.9, label="ReLoRA Delta")
ax.hist(lora_sv, bins=100, density=True, alpha=0.5, label="LoRA Delta")
ax.hist(full_sv, bins=100, density=True, alpha=0.3, label="Delta between full models")

# How many singular values fall below 0.1 in each set.
print("ReLoRA Delta: ", (relora_sv < 0.1).sum())
print("LoRA Delta: ", (lora_sv < 0.1).sum())
print("Delta between full models: ", (full_sv < 0.1).sum())

# Fixed viewing window for the plot.
ax.set_xlim(0, 4)
ax.set_ylim(0, 3)

ax.legend()

# Write the figure to a PDF next to the notebook, then display it.
plt.savefig("q_proj_singular_values.pdf", bbox_inches='tight')
plt.show()

In [None]:
# plot histogram of singular values for up_projs over layers
from matplotlib import pyplot as plt

# Overlaid, density-normalised histograms of the Up-projection singular values
# for the three weight deltas computed in earlier cells.
fig, ax = plt.subplots(1, 1, figsize=(4, 4), dpi=200)
ax.set_title("Singular Values of Up Projections")
ax.set_xlabel("Singular Value")
ax.set_ylabel("Frequency")

# Flatten the per-layer spectra once; each array feeds both the histogram and the count.
relora_sv = torch.cat(up_projs_delta).numpy()
lora_sv = torch.cat(full_peft_up_projs).numpy()
full_sv = torch.cat(up_projs_delta_full).numpy()

ax.hist(relora_sv, density=True, bins=100, alpha=0.9, label="ReLoRA Delta")
ax.hist(lora_sv, density=True, bins=100, alpha=0.5, label="LoRA Delta")
ax.hist(full_sv, density=True, bins=100, alpha=0.3, label="Delta between full models")

# print numbers of singular values < 0.1
print("ReLoRA Delta: ", (relora_sv < 0.1).sum())
print("LoRA Delta: ", (lora_sv < 0.1).sum())
print("Delta between full models: ", (full_sv < 0.1).sum())

# Fixed viewing window for the plot.
ax.set_ylim(0, 3)
ax.set_xlim(0, 2)

ax.legend()

# pdf
fig.savefig("up_proj_singular_values.pdf", bbox_inches='tight')
# plt.show() matches the Q-projection cell; Figure.show() is meant for GUI backends
# and emits a warning on non-GUI ones.
plt.show()

In [None]:
# Overlaid, density-normalised histograms of the V-projection singular values
# for the three weight deltas computed in earlier cells.
fig, ax = plt.subplots(1, 1, figsize=(4, 4), dpi=200)
ax.set_title("Singular Values of V Projections")
ax.set_xlabel("Singular Value")
ax.set_ylabel("Frequency")

# Flatten the per-layer spectra once; each array feeds both the histogram and the count.
relora_sv = torch.cat(v_projs_delta).numpy()
lora_sv = torch.cat(full_peft_v_projs).numpy()
full_sv = torch.cat(v_projs_delta_full).numpy()

ax.hist(relora_sv, density=True, bins=100, alpha=0.9, label="ReLoRA Delta")
ax.hist(lora_sv, density=True, bins=100, alpha=0.5, label="LoRA Delta")
ax.hist(full_sv, density=True, bins=100, alpha=0.3, label="Delta between full models")

# Print the number of singular values < 0.1
print("ReLoRA Delta: ", (relora_sv < 0.1).sum())
print("LoRA Delta: ", (lora_sv < 0.1).sum())
print("Delta between full models: ", (full_sv < 0.1).sum())

# Set the axis limits
ax.set_ylim(0, 3)
ax.set_xlim(0, 2)

ax.legend()

# Save the figure as a PDF
fig.savefig("v_proj_singular_values.pdf", bbox_inches='tight')
# plt.show() matches the Q-projection cell; Figure.show() is meant for GUI backends
# and emits a warning on non-GUI ones.
plt.show()


In [None]:
# Overlaid, density-normalised histograms of the Down-projection singular values
# for the three weight deltas computed in earlier cells.
fig, ax = plt.subplots(1, 1, figsize=(4, 4), dpi=200)
ax.set_title("Singular Values of Down Projections")
ax.set_xlabel("Singular Value")
ax.set_ylabel("Frequency")

# Flatten the per-layer spectra once; each array feeds both the histogram and the count.
relora_sv = torch.cat(down_projs_delta).numpy()
lora_sv = torch.cat(full_peft_down_projs).numpy()
full_sv = torch.cat(down_projs_delta_full).numpy()

ax.hist(relora_sv, density=True, bins=100, alpha=0.9, label="ReLoRA Delta")
ax.hist(lora_sv, density=True, bins=100, alpha=0.5, label="LoRA Delta")
ax.hist(full_sv, density=True, bins=100, alpha=0.3, label="Delta between full models")

# Print the number of singular values < 0.1
print("ReLoRA Delta: ", (relora_sv < 0.1).sum())
print("LoRA Delta: ", (lora_sv < 0.1).sum())
print("Delta between full models: ", (full_sv < 0.1).sum())

# Set the axis limits
ax.set_ylim(0, 3)
ax.set_xlim(0, 2)

ax.legend()

# Save the figure as a PDF
fig.savefig("down_proj_singular_values.pdf", bbox_inches='tight')
# plt.show() matches the Q-projection cell; Figure.show() is meant for GUI backends
# and emits a warning on non-GUI ones.
plt.show()

In [None]:
# One row of four panels (Q, V, Up, Down), each overlaying the ReLoRA, LoRA and
# full-rank singular-value histograms on a shared 0..2 range.
fig, axes = plt.subplots(1, 4, figsize=(16, 4), dpi=150)
# Global font size for text created from here on.
plt.rcParams.update({'font.size': 16})

titles = ["Q Projections", "V Projections", "Up Projections", "Down Projections"]
delta_data = [q_projs_delta, v_projs_delta, up_projs_delta, down_projs_delta]
full_delta_data = [q_projs_delta_full, v_projs_delta_full, up_projs_delta_full, down_projs_delta_full]
lora_delta_data = [full_peft_q_projs, full_peft_v_projs, full_peft_up_projs, full_peft_down_projs]

panels = zip(axes, titles, delta_data, lora_delta_data, full_delta_data)
for i, (ax, title, relora_layers, lora_layers, full_layers) in enumerate(panels):
    ax.set_title(title)
    ax.set_xlabel("Singular Value", fontsize=16)
    # Only the left-most panel carries the y-axis label.
    if i == 0:
        ax.set_ylabel("Frequency", fontsize=16)

    # Flatten the per-layer spectra into one array per curve.
    relora_sv = torch.cat(relora_layers).numpy()
    lora_sv = torch.cat(lora_layers).numpy()
    full_sv = torch.cat(full_layers).numpy()

    hist_kwargs = dict(density=True, bins=50, range=(0, 2))
    ax.hist(relora_sv, alpha=0.9, label="ReLoRA", **hist_kwargs)
    ax.hist(lora_sv, alpha=0.5, label="LoRA", **hist_kwargs)
    ax.hist(full_sv, alpha=0.3, label="Full-rank\ntraining", **hist_kwargs)

    # Report how many singular values fall below 0.1 in each set.
    print(f"Number of singular values < 0.1 ReLoRA ({title}): ", (relora_sv < 0.1).sum())
    print(f"Number of singular values < 0.1 full-rank training ({title}): ", (full_sv < 0.1).sum())
    print(f"Number of singular values < 0.1 LoRA ({title}): ", (lora_sv < 0.1).sum())

    # Same viewing window on every panel.
    ax.set_ylim(0, 3)
    ax.set_xlim(0, 2)

    ax.legend(fontsize=16)

# Write the combined figure to a PDF, then display it.
plt.savefig("projection_singular_values.pdf", bbox_inches='tight')
plt.show()

In [None]:
# One row of five panels (Q, K, V, Up, Down), each overlaying the ReLoRA, LoRA and
# full-rank singular-value histograms on a shared 0..2 range.
fig, axes = plt.subplots(1, 5, figsize=(16, 4), dpi=150)
# Set font size
plt.rcParams.update({'font.size': 16})

titles = ["Q Projections", "K Projections", "V Projections", "Up Projections", "Down Projections"]
delta_data = [q_projs_delta, k_projs_delta, v_projs_delta, up_projs_delta, down_projs_delta]
full_delta_data = [q_projs_delta_full, k_projs_delta_full, v_projs_delta_full, up_projs_delta_full, down_projs_delta_full]
# The second entry must be the K spectra: it previously repeated full_peft_q_projs, so
# the "K Projections" panel showed the Q-projection LoRA curve and count.
lora_delta_data = [full_peft_q_projs, full_peft_k_projs, full_peft_v_projs, full_peft_up_projs, full_peft_down_projs]

# The four lists are indexed in parallel below; catch an accidental length mismatch early.
assert len(titles) == len(delta_data) == len(full_delta_data) == len(lora_delta_data) == len(axes)

for i, ax in enumerate(axes):
    ax.set_title(titles[i])
    ax.set_xlabel("Singular Value", fontsize=16)
    # Only the left-most panel carries the y-axis label.
    if i == 0:
        ax.set_ylabel("Frequency", fontsize=16)

    # Flatten the per-layer spectra once; each array feeds both the histogram and the count.
    relora_sv = torch.cat(delta_data[i]).numpy()
    lora_sv = torch.cat(lora_delta_data[i]).numpy()
    full_sv = torch.cat(full_delta_data[i]).numpy()

    ax.hist(relora_sv, density=True, bins=50, range=(0, 2), alpha=0.9, label="ReLoRA")
    ax.hist(lora_sv, density=True, bins=50, range=(0, 2), alpha=0.5, label="LoRA")
    ax.hist(full_sv, density=True, bins=50, range=(0, 2), alpha=0.3, label="Full-rank\ntraining")

    # Print the number of singular values < 0.1
    print(f"Number of singular values < 0.1 ReLoRA ({titles[i]}): ", (relora_sv < 0.1).sum())
    print(f"Number of singular values < 0.1 full-rank training ({titles[i]}): ", (full_sv < 0.1).sum())
    print(f"Number of singular values < 0.1 LoRA ({titles[i]}): ", (lora_sv < 0.1).sum())

    # Set the axis limits
    ax.set_ylim(0, 3)
    ax.set_xlim(0, 2)

    # Set legend font size
    ax.legend(fontsize=16)

# Save the figure as a PDF
# NOTE(review): this is the same filename the four-panel cell writes to, so running
# both cells leaves only this five-panel figure on disk -- confirm that is intended.
plt.savefig("projection_singular_values.pdf", bbox_inches='tight')
plt.show()