In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name_or_path = 'meta-llama/Llama-2-7b-chat-hf'
model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.float16,
        device_map='auto'
        ).eval()
    
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.unk_token if tokenizer.pad_token is None else tokenizer.pad_token

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


In [2]:
template =  "[INST] <<SYS>><</SYS>>\n\n{instruction} [/INST] "
input_test = "Hello, I am a chatbot. What is your name?"
input_test = template.format(instruction=input_test)
input_sample = tokenizer(input_test, return_tensors='pt')
input_ids = input_sample['input_ids'].cuda()
attention_mask = input_sample['attention_mask'].cuda()

In [3]:
# print the layer name for each layer in the llama model
for name, param in model.named_parameters():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.k_proj.weight
model.layers.2.self_attn.v_proj.weight
model.layers.2.self_attn.o_proj.weight
model.layers.2.mlp.gate_proj.weight
model.layers.2.mlp.up_proj.weight
model.layers.2.mlp.down_proj.weight
model.layers.2.inp

In [4]:
for name, module in model.named_modules():
    print(name)


model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.rotary_emb
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.rotary_emb
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.

In [5]:
from collections import OrderedDict

def attach_hooks(model, kind='mlp'):
    act_dict = OrderedDict()

    def _make_hook(name):
        def _hook(mod, inp, out):
            act_dict[name] = out[0]

        return _hook

    for name, module in model.named_modules():
        if kind == 'mlp':
            if isinstance(module, torch.nn.Linear):
                module.register_forward_hook(_make_hook(name))
        elif kind == 'attn':
            if isinstance(module, torch.nn.MultiheadAttention):
                module.register_forward_hook(_make_hook(name))
        elif kind == 'all':
            module.register_forward_hook(_make_hook(name))
        else:
            raise ValueError('Unknown kind')
    return act_dict

hooked_model = attach_hooks(model, kind='all')
output = model(input_ids, attention_mask=attention_mask)

In [10]:
for name, x in hooked_model.items():
    print(name, x.shape)

model.embed_tokens torch.Size([33, 4096])
model.layers.0.input_layernorm torch.Size([33, 4096])
model.layers.0.self_attn.q_proj torch.Size([33, 4096])
model.layers.0.self_attn.k_proj torch.Size([33, 4096])
model.layers.0.self_attn.v_proj torch.Size([33, 4096])
model.layers.0.self_attn.rotary_emb torch.Size([33, 128])
model.layers.0.self_attn.o_proj torch.Size([33, 4096])
model.layers.0.self_attn torch.Size([1, 33, 4096])
model.layers.0.post_attention_layernorm torch.Size([33, 4096])
model.layers.0.mlp.gate_proj torch.Size([33, 11008])
model.layers.0.mlp.act_fn torch.Size([33, 11008])
model.layers.0.mlp.up_proj torch.Size([33, 11008])
model.layers.0.mlp.down_proj torch.Size([33, 4096])
model.layers.0.mlp torch.Size([33, 4096])
model.layers.0 torch.Size([1, 33, 4096])
model.layers.1.input_layernorm torch.Size([33, 4096])
model.layers.1.self_attn.q_proj torch.Size([33, 4096])
model.layers.1.self_attn.k_proj torch.Size([33, 4096])
model.layers.1.self_attn.v_proj torch.Size([33, 4096])
mode

In [11]:
input_ids.shape

torch.Size([1, 33])

In [7]:
import torch
import transformers

name = 'mosaicml/mpt-7b-chat'

config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
config.attn_config['attn_impl'] = 'triton'
config.init_device = 'cuda:0' # For fast initialization directly on GPU!

model = transformers.AutoModelForCausalLM.from_pretrained(
  name,
  config=config,
  torch_dtype=torch.bfloat16, # Load model weights in bfloat16
  trust_remote_code=True
)


TypeError: 'type' object is not subscriptable