In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="SparseLLM/ReluLLaMA-7B")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

tokenizer = AutoTokenizer.from_pretrained(
    "SparseLLM/ReluLLaMA-7B", use_fast=False
)
# output hidden state
config = AutoConfig.from_pretrained("SparseLLM/ReluLLaMA-7B", output_hidden_states=True)
model = AutoModelForCausalLM.from_pretrained("SparseLLM/ReluLLaMA-7B", config=config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
from calflops import calculate_flops
flops, macs, params = calculate_flops(
    model=model,
    input_shape=(1,11),
    output_as_string=True,
    output_precision=4,
    transformer_tokenizer=tokenizer
)
print("Alexnet FLOPs:%s   MACs:%s   Params:%s \n" % (flops, macs, params))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                  6.74 B  
fwd MACs:                                                               72.6779 GMACs
fwd FLOPs:                                                              145.36 GFLOPS
fwd+bwd MACs:                                                           218.034 GMACs
fwd+bwd FLOPs:                                                          436.079 GFLOPS

-------------------------------- Detailed Calculated FLOPs Results --------------------------------
Each mod

In [2]:
type(model)

transformers.models.llama.modeling_llama.LlamaForCausalLM

In [3]:
[module for module in model.named_modules()]

[('',
  LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 4096, padding_idx=0)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
            (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
            (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
            (act_fn): ReLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): LlamaRMSNor

In [4]:
model.get_submodule("model.layers.31.mlp.act_fn")

ReLU()

In [5]:
from torchknickknacks import modelutils
layer = model.get_submodule("model.layers.18.mlp.act_fn")
layer2 = model.get_submodule("model.layers.31.mlp.down_proj")
recorder = modelutils.Recorder(layer, record_output=True, backward=False)
recorder2 = modelutils.Recorder(layer2, record_output=True, backward=False)

In [19]:
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)

TypeError: ReLU.forward() got an unexpected keyword argument 'attention_mask'

In [7]:
print(recorder.recording)
print(recorder2.recording)
sum(sum(sum(recorder.recording == 0)))

tensor([[[0.0931, 0.0139, 0.0314,  ..., 0.0000, 0.0000, 0.0000],
         [0.1245, 0.0000, 0.0000,  ..., 0.0265, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.2297, 0.0000,  ..., 0.0000, 0.0000, 0.5524],
         [0.0000, 0.0022, 0.0000,  ..., 0.0000, 0.0000, 0.2228],
         [0.1003, 0.0000, 0.0274,  ..., 0.0000, 0.0000, 0.0000]]],
       grad_fn=<ReluBackward0>)
tensor([[[-1.1766,  2.3352,  6.6595,  ...,  1.9904, -1.7416,  3.1122],
         [-0.8362,  1.2037, -0.8024,  ..., -1.0820, -0.5340, -0.5148],
         [ 1.0643, -0.9274, -1.4926,  ..., -0.8088, -1.6298, -1.8066],
         ...,
         [ 4.2380, -4.3821, -1.6420,  ..., -3.2197, -4.1690, -4.2523],
         [ 3.2101, -2.4574,  0.7382,  ..., -2.9415,  0.8972, -2.0524],
         [ 0.8878,  2.2958,  4.5667,  ...,  1.1654, -2.0586,  1.2388]]],
       grad_fn=<UnsafeViewBackward0>)


tensor(74572)

In [16]:
recorder.recording.shape

torch.Size([1, 11, 11008])

In [8]:
output.hidden_states[1]

tensor([[[ 0.0008, -0.0216,  0.0394,  ..., -0.0171, -0.0351,  0.0442],
         [ 0.0371, -0.0065, -0.0127,  ...,  0.0360, -0.0056,  0.0015],
         [ 0.0006, -0.0350,  0.0098,  ..., -0.0288,  0.0186, -0.0051],
         ...,
         [ 0.0021, -0.0060, -0.0308,  ...,  0.0009,  0.0104,  0.0362],
         [ 0.0015,  0.0100,  0.0065,  ..., -0.0196,  0.0180, -0.0051],
         [ 0.0113, -0.0060, -0.0134,  ..., -0.0091,  0.0101, -0.0051]]],
       grad_fn=<AddBackward0>)

In [9]:
output.attentions
# output.pooler_output is not available for 'CausalLMOutputWithPast' object

In [10]:
wrapped_model = model.base_model
wrapped_model.__dict__

{'training': False,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_hooks_always_called': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_forward_pre_hooks_with_kwargs': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': OrderedDict([('embed_tokens',
               Embedding(32000, 4096, padding_idx=0)),
              ('layers',
               ModuleList(
                 (0-31): 32 x LlamaDecoderLayer(
                   (self_attn): LlamaSdpaAttention(
                     (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
                     (k_proj): Linear(in_features=

In [11]:
for idx, layer in enumerate(wrapped_model.layers):
    if idx == 0:
        print(layer.__dict__)

{'training': False, '_parameters': OrderedDict(), '_buffers': OrderedDict(), '_non_persistent_buffers_set': set(), '_backward_pre_hooks': OrderedDict(), '_backward_hooks': OrderedDict(), '_is_full_backward_hook': None, '_forward_hooks': OrderedDict(), '_forward_hooks_with_kwargs': OrderedDict(), '_forward_hooks_always_called': OrderedDict(), '_forward_pre_hooks': OrderedDict(), '_forward_pre_hooks_with_kwargs': OrderedDict(), '_state_dict_hooks': OrderedDict(), '_state_dict_pre_hooks': OrderedDict(), '_load_state_dict_pre_hooks': OrderedDict(), '_load_state_dict_post_hooks': OrderedDict(), '_modules': OrderedDict([('self_attn', LlamaSdpaAttention(
  (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (rotary_emb): LlamaRotaryEmbedding()
)), ('mlp', LlamaMLP(
  (gate_pro

In [12]:
from peft.utils.other import _get_submodules
parent, target, target_name = _get_submodules(model, "model.layers.31")
from torch.nn import ReLU
setattr(parent, target_name, ReLU())

In [14]:
[module for module in model.named_modules()]

[('',
  LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 4096, padding_idx=0)
      (layers): ModuleList(
        (0-30): 31 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
            (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
            (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
            (act_fn): ReLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): LlamaRMSNor

In [15]:
_get_submodules(model, "model.layers.31.mlp")

AttributeError: ReLU has no attribute `mlp`

In [None]:
output = model(**encoded_input)

TypeError: ReLU.forward() got an unexpected keyword argument 'attention_mask'