In [1]:
# pip install transformers
# pip install thop
# pip install torch

from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# 定义模型和 tokenizer 的路径
model_directory = r"Z:/llmfile/Meta-Llama-3.1-8B-Instruct/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f"
# model_directory = r"Z:/llmfile/Llama-2-7B-hf/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"

# 加载模型和 tokenizer
model = AutoModelForCausalLM.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# 打印模型结构
print(model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [3]:
# 计算模型参数数量
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Total parameters: 8030261248
Trainable parameters: 8030261248


In [10]:
from thop import profile
import torch

# 定义输入的形状（虚拟输入，确保类型为 LongTensor）
input_shape = (1, 512)  # batch size 1, sequence length 2048
long_input = torch.ones(input_shape, dtype=torch.long)  # 确保输入为 LongTensor

# 使用 thop 来计算 FLOPs 和参数数量
macs, params = profile(model, inputs=(long_input,))

# 打印 FLOPs 和参数信息
print(f"FLOPs: {macs}")
print(f"Parameters: {params}")

[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
FLOPs: 13531294466048.0
Parameters: 6607077376.0


In [17]:
# 加载模型
model_directory = r"Z:/llmfile/Llama-2-7B-hf/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
model = AutoModelForCausalLM.from_pretrained(model_directory)

# 定义输入张量
input_shape = (1, 512)  # batch size 1, sequence length 2048
dummy_input = torch.ones(input_shape, dtype=torch.long)

# 前向传播 FLOPs 计算
macs, params = profile(model, inputs=(dummy_input,))
print(f"Inference FLOPs (per forward pass): {macs / 1e9} GFLOPs")

# 假设反向传播 FLOPs 是前向传播的两倍
training_flops_per_batch = 3 * macs  # 1 次前向传播 + 2 次反向传播
print(f"Training FLOPs (per batch): {training_flops_per_batch / 1e12} TFLOPs")

# 假设有 n_batches 个 batch 训练
n_batches = 32  # 例如 10000 个 batch
total_training_flops = training_flops_per_batch * n_batches
print(f"Total Training FLOPs: {total_training_flops / 1e12} TFLOPs")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
Inference FLOPs (per forward pass): 3382.823616512 GFLOPs
Training FLOPs (per batch): 10.148470849536 TFLOPs
Total Training FLOPs: 324.751067185152 TFLOPs
