In [2]:
import torch
from transformers import AutoTokenizer, LlamaForCausalLM
#tokenizer = AlbertTokenizer.from_pretrained('albert-xlarge-v2')
def get_model(model_name):
    """Load one of the supported 7B Llama-family causal language models.

    Args:
        model_name: Short name of the checkpoint. One of ``"llama"``,
            ``"llama_chat"`` or ``"vicuna"``.

    Returns:
        The ``LlamaForCausalLM`` instance produced by ``from_pretrained`` for
        the matching Hugging Face checkpoint id (loaded with
        ``low_cpu_mem_usage=True`` and no explicit ``device_map``).

    Raises:
        ValueError: If ``model_name`` is not one of the supported short names.
    """
    checkpoints = {
        "llama": "meta-llama/Llama-2-7b-hf",
        "llama_chat": "meta-llama/Llama-2-7b-chat-hf",
        "vicuna": "lmsys/vicuna-7b-v1.5",
    }
    # Validate before touching CUDA or the network so a typo fails fast with a
    # readable message instead of an unbound-variable error further down.
    if model_name not in checkpoints:
        supported = ", ".join(sorted(checkpoints))
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of: {supported}"
        )
    base_model = checkpoints[model_name]

    # Release cached GPU blocks held by a previously loaded model before
    # loading the next one.
    torch.cuda.empty_cache()

    # The tokenizer used to be loaded here and then thrown away; only the model
    # is returned, so that load has been dropped.
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        low_cpu_mem_usage=True,
        #device_map="auto"
    )
    return model

# Load the checkpoint that get_model maps the short name "llama" to; later cells read `model`.
model =  get_model("llama")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
import numpy as np
# Function to print details of each module and its parameters
def print_model_parameters(model):
    """Print every sub-module of ``model`` with the parameters it directly owns.

    For each entry of ``model.named_modules()`` a ``Module: <name>`` line is
    printed, followed by one line per parameter registered directly on that
    module (``recurse=False``, so nothing is listed twice through a parent).
    A final line reports the total number of scalar parameters.

    Args:
        model: A ``torch.nn.Module``.

    Returns:
        int: Total number of parameter elements. A tensor shared between
        several modules (tied weights) is listed under each owner but counted
        only once.
    """
    total = 0
    counted = set()  # ids of tensors already added to the total (tied weights)
    for name, module in model.named_modules():
        print(f"Module: {name}")
        for param_name, param in module.named_parameters(recurse=False):
            print(f"\t  Parameter: {param_name}, Shape: {param.shape}, Requires Grad: {param.requires_grad}")
            if id(param) not in counted:
                counted.add(id(param))
                # torch.Size has no to_list(); numel() gives the element count directly.
                total += param.numel()
        print("")
    print(f"Total parameters: {total}")
    return total

# Print every module of the loaded model together with the parameters it directly owns.
print_model_parameters(model)


Module: 

Module: model

Module: model.embed_tokens
	  Parameter: weight, Shape: torch.Size([32000, 4096]), Requires Grad: True

Module: model.layers

Module: model.layers.0

Module: model.layers.0.self_attn

Module: model.layers.0.self_attn.q_proj
	  Parameter: weight, Shape: torch.Size([4096, 4096]), Requires Grad: True

Module: model.layers.0.self_attn.k_proj
	  Parameter: weight, Shape: torch.Size([4096, 4096]), Requires Grad: True

Module: model.layers.0.self_attn.v_proj
	  Parameter: weight, Shape: torch.Size([4096, 4096]), Requires Grad: True

Module: model.layers.0.self_attn.o_proj
	  Parameter: weight, Shape: torch.Size([4096, 4096]), Requires Grad: True

Module: model.layers.0.self_attn.rotary_emb

Module: model.layers.0.mlp

Module: model.layers.0.mlp.gate_proj
	  Parameter: weight, Shape: torch.Size([11008, 4096]), Requires Grad: True

Module: model.layers.0.mlp.up_proj
	  Parameter: weight, Shape: torch.Size([11008, 4096]), Requires Grad: True

Module: model.layers.0.mlp.d

In [3]:
import math
import matplotlib.pyplot as plt
# Reset every matplotlib rcParam to its built-in default before the plotting helpers below are defined and used.
plt.rcParams.update(plt.rcParamsDefault)

# returns Bm, Vm whose meaning is this:
# 
# x_i (t+1) = x_i(t) + \sum_m \sum_j 1/Z_{i,m} exp(<x_i, Bm x_j>) Vm x_j
#
# Here tokens are normalized so that ||x_i||=\sqrt{d_model} (i.e. not on a unit sphere).
#
def albert_get_BV(al_model, head_idx=0):
    """Extract the symmetrized bilinear form and value matrix of one attention head.

    Reads the attention block at
    ``al_model.encoder.albert_layer_groups[0].albert_layers[0].attention`` and
    uses only its ``query``/``key``/``value``/``dense`` weight matrices (bias
    terms are not read).

    Args:
        al_model: Model exposing the attribute path above, with
            ``attention_head_size`` and ``hidden_size`` on the attention module.
        head_idx: Zero-based index ``m`` of the head to extract.

    Returns:
        Tuple ``(b_mtx, value_mtx)`` of detached tensor copies:

        * ``b_mtx`` -- ``0.5 * (Bm + Bm^T) / sqrt(dk)`` with
          ``Bm = WQ_m @ WK_m^T`` (symmetric by construction).
        * ``value_mtx`` -- ``(WV_m @ D)^T``, i.e. the head's value projection
          followed by the output dense layer, transposed so that it acts on
          tokens written as column vectors.

    Raises:
        IndexError: If ``head_idx`` does not address one of the heads present
            in the query projection (previously this silently produced
            all-zero matrices).
    """
    al_attention = al_model.encoder.albert_layer_groups[0].albert_layers[0].attention

    dk = al_attention.attention_head_size
    dmodel = al_attention.hidden_size
    m = head_idx

    # Pure weight read-out: no autograd graph is needed for the intermediates.
    with torch.no_grad():
        # These matrices act on a token x (row vector) as x @ WQ etc.
        WQ = al_attention.query.weight.T
        WK = al_attention.key.weight.T
        WV = al_attention.value.weight.T
        D = al_attention.dense.weight.T

        n_heads = WQ.shape[1] // dk
        if not 0 <= m < n_heads:
            raise IndexError(
                f"head_idx {head_idx} is out of range for {n_heads} attention head(s)"
            )

        # Columns of the projection matrices that belong to head m.
        head_cols = slice(dk * m, dk * m + dk)
        WQm = WQ[:, head_cols]
        WKm = WK[:, head_cols]

        # Value projection with every other head zeroed out. Allocate with the
        # weights' dtype/device so the matmul below also works for
        # half/double-precision or GPU-resident models.
        WVm = torch.zeros(dmodel, dmodel, dtype=WV.dtype, device=WV.device)
        WVm[:, head_cols] = WV[:, head_cols]

        # Bilinear form matrix, symmetrized and scaled by 1/sqrt(dk).
        b_mtx = torch.matmul(WQm, WKm.T)
        b_mtx = 0.5 * (b_mtx + b_mtx.T) / math.sqrt(dk)

        # Convert value matrix to a matrix acting on tokens as column vectors.
        value_mtx = torch.matmul(WVm, D).T

    return b_mtx.clone().detach(), value_mtx.clone().detach()


def plot_B_spectra(al_model):
    """Histogram the non-zero eigenvalues of each head's symmetrized bilinear form.

    Only configurations with 16 heads (4x4 grid of panels) or a single head
    (one panel) are accepted, and the hidden size must be 2048; anything else
    trips an assertion. All panels end up sharing the same x-range, and the
    list of per-head "effective betas" is printed at the end.

    Args:
        al_model: Model accepted by ``albert_get_BV``, with a ``config``
            exposing ``num_attention_heads`` and ``hidden_size``.
    """
    cfg = al_model.config
    assert cfg.num_attention_heads == 16 or cfg.num_attention_heads == 1

    print('Note that these matrices act on token vectors normalized to ||x||=sqrt(2048)')

    # One panel per head: a 4x4 grid for 16 heads, a lone axis otherwise.
    if cfg.num_attention_heads == 16:
        heads = 16
        fig, grid = plt.subplots(4, 4)
        panels = grid.flatten()
    elif cfg.num_attention_heads == 1:
        heads = 1
        fig, lone_axis = plt.subplots(1)
        panels = [lone_axis]
    else:
        raise AssertionError('num heads')

    lo = 0
    hi = 0
    betas = []

    dmodel = cfg.hidden_size
    assert dmodel == 2048

    for head, panel in enumerate(panels):
        B, _unused_value = albert_get_BV(al_model, head)

        # Spectrum of the symmetric form, with numerically-zero modes dropped.
        spectrum = torch.linalg.eigvalsh(B)
        spectrum = spectrum[spectrum.abs() > 1e-6]

        if heads == 16:
            assert len(spectrum) == 256

        panel.hist(spectrum, bins=40, density=True)
        panel.set_title(f'head {head}')
        panel.set_ylim(0, 8)

        # Track the overall eigenvalue range (always including 0).
        lo = min(lo, spectrum.min())
        hi = max(hi, spectrum.max())

        # Frobenius norm of B scaled by sqrt(dmodel).
        betas.append(math.sqrt((B.flatten() ** 2).sum() * dmodel))

    # Put every panel on the common x-range found above.
    for panel in panels:
        panel.set_xlim(lo, hi)

    # Effective beta: the beta that would give the same typical RMSE value (for
    # a pair of independent isotropic tokens) in the model with
    # exp(<x_i, x_j> * beta / d_model), where x_i has N(0,1) coordinates.
    print('Effective betas = ', betas)
    
    
def plot_V_spectra(al_model):
    """Scatter-plot the complex eigenvalues of every head's value matrix.

    For each head the value matrix returned by ``albert_get_BV`` is
    diagonalized, eigenvalues with modulus <= 1e-4 are discarded, and the rest
    are drawn in the complex plane (real part on x, imaginary part on y) with
    fixed axis limits. Each figure is shown and also written to
    ``eigs<head>.pdf`` (1-based head number) in the current working directory.

    Args:
        al_model: Model accepted by ``albert_get_BV``, with a ``config``
            exposing ``num_attention_heads`` and ``hidden_size``.
    """
    heads = al_model.config.num_attention_heads
    dmodel = al_model.config.hidden_size

    print(heads)
    # Report the model's actual hidden size rather than a hard-coded 2048.
    print(f'Note that these matrices act on token vectors normalized to ||x||=sqrt({dmodel})')

    label_size = 16

    for i in range(heads):
        _, V = albert_get_BV(al_model, i)
        eigs = torch.linalg.eigvals(V)
        eigs = eigs[eigs.abs() > 1e-4]

        if heads == 16:
            print(f'Head = {i}, non-zero eigs = {len(eigs)}')

        # Plain arrays plot faster than a Python list of 0-dim tensors.
        real_part = eigs.real.detach().cpu().numpy()
        imag_part = eigs.imag.detach().cpu().numpy()

        fig, ax = plt.subplots()
        try:
            # Style only this axis instead of mutating the global rcParams,
            # which would leak into every later figure in the session.
            ax.tick_params(axis='both', labelsize=label_size)

            ax.spines['right'].set_visible(False)
            ax.spines['top'].set_visible(False)

            ax.grid(color='silver', linestyle=':', linewidth=0.15, zorder=3)
            ax.set_axisbelow(True)

            ax.scatter(real_part,
                       imag_part,
                       color='crimson',
                       linewidth=0.75,
                       edgecolors='black')
            ax.set_title(f'Eigenvalues of value matrix for head {i + 1}')
            ax.set_xlim(-2.25 - 0.1, 1.5 + 0.1)
            ax.set_ylim(-1.5 - 0.1, 1.5 + 0.1)
            ax.set_aspect('equal', adjustable='box')

            base_filename = f"eigs{i + 1}.pdf"
            fig.savefig(base_filename,
                        format='pdf',
                        bbox_inches='tight')
            plt.show()
        finally:
            # Always release the figure, even if saving or showing fails.
            plt.close(fig)
            

In [4]:

# NOTE(review): `al_model` is not assigned in any cell visible here, so this cell depends on leftover kernel state.
# NOTE(review): plot_V_spectra calls albert_get_BV, which reads `al_model.encoder.albert_layer_groups`;
# the recorded traceback shows a LlamaForCausalLM being passed, which has no `encoder` attribute.
print("Got Llama")
plot_V_spectra(al_model)

Got Llama
32
Note that these matrices act on token vectors normalized to ||x||=sqrt(2048)
32


AttributeError: 'LlamaForCausalLM' object has no attribute 'encoder'