## 1.检查LlamaTransformer.from_pretrained方法是否能正确加载模型权重到自定义的backbone
### 它的推理结果理论上跟AutoModelForCausalLM的差不多

In [1]:
import torch
from transformers import AutoModelForCausalLM
from utils.nn_toolkit import set_seed, auto_device

In [2]:
# Seed the RNGs before drawing the dummy input.
# NOTE(review): set_seed / auto_device are project helpers; presumably they make
# the run reproducible and pick the available accelerator (the recorded output
# below shows cuda:0) - confirm against utils.nn_toolkit.
set_seed(12345)
device = auto_device()

model_path = 'D:/PycharmProjects/llama3_proj/models/Meta-Llama-3-8B'

# Build a dummy input: one sequence of 4 random token ids drawn from [0, 10000).
# The same tensor is fed to both the reference model and the custom model below.
test_input = torch.randint(0, 10000, (1, 4)).to(device)

In [3]:
# Reference model: the Hugging Face implementation, loaded in bfloat16
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device)
print(model)

# Run a forward pass and also request the hidden states; the last hidden state
# is the input to lm_head (the layer right before it)
model.eval()
with torch.no_grad():
    res = model(test_input, output_hidden_states=True)

print(res.logits)
# print(res.hidden_states[-1][0])

# Drop the reference model and release cached GPU memory so the next model
# can be loaded
del model
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [4]:
from model.llama import LlamaTransformer
# Load the same checkpoint into the custom backbone, again in bfloat16
model = LlamaTransformer.from_pretrained("llama-3-8B", model_path, torch.bfloat16).to(device)

# Feed the same dummy input as the reference run and print the logits so the
# two outputs can be compared.
# NOTE(review): the recorded output contains a single row of logits, so the
# default forward presumably returns only the last position (the HellaSwag
# cell passes fast_inference=False explicitly) - confirm in model.llama.
model.eval()
with torch.no_grad():
    logits, _ = model(test_input)
print(logits)

# Drop the model and release cached GPU memory
del model
torch.cuda.empty_cache()

loading weights from pretrained model: meta-llama/Meta-Llama-3-8B => {'n_layers': 32, 'n_heads': 32, 'dim': 4096, 'n_kv_heads': 8, 'vocab_size': 128256, 'max_seq_len': 2048}
parameters num: 291


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 291/291 [00:00<00:00, 375.28it/s]


tensor([[[ 3.7031,  6.3750,  3.3750,  ..., -7.1562, -7.1562, -7.1562]]],
       device='cuda:0', dtype=torch.bfloat16)


## 2.使用hellaswag数据集来测试LlamaTransformer模型结构以及LlamaTransformer.from_pretrained方法的正确性

In [1]:
from utils.hellaswag import render_example, iterate_examples
import torch
from torch.nn import functional as F

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Materialize the "val" split as a list of (index, example) pairs so it can be
# counted here and iterated with a progress bar later
val = list(enumerate(iterate_examples("val")))
len(val)

10042

In [3]:
def get_most_likely_row(tokens, mask, logits):
    """Return the index of the row whose completion the model finds most likely.

    Each row holds one candidate (shared context followed by one completion).
    The per-token autoregressive loss is averaged over the completion region
    only, and the row with the lowest average loss wins.

    Args:
        tokens: Integer tensor of token ids, one row per candidate.
        mask: Tensor with the same layout as ``tokens``; 1 marks completion
            tokens, 0 marks everything else.
        logits: Model output for ``tokens``; its last dimension is the
            vocabulary and its second-to-last dimension is the position.

    Returns:
        int: Row index with the lowest average completion loss. Rows whose
        shifted mask contains no completion token are assigned an infinite
        loss, so they are never preferred over a row that has one.
    """
    # Position t predicts token t + 1: drop the last logit and the first token.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_tokens = tokens[..., 1:].contiguous()
    # Per-token loss, computed on the flattened batch and folded back to rows.
    shift_losses = F.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_tokens.view(-1),
        reduction='none',
    ).view(tokens.size(0), -1)
    # The mask is shifted the same way, so it starts at the last prompt token.
    shift_mask = mask[..., 1:].contiguous()
    # Average the loss over the completion region of each row.
    sum_loss = (shift_losses * shift_mask).sum(dim=1)
    completion_len = shift_mask.sum(dim=1)
    avg_loss = sum_loss / completion_len
    # A row without completion tokens yields 0 / 0 = NaN, which could be picked
    # by argmin; mark such rows as infinitely unlikely instead.
    avg_loss = avg_loss.masked_fill(completion_len == 0, float('inf'))
    # The lowest average loss identifies the most likely completion.
    pred_norm = avg_loss.argmin().item()
    return pred_norm

In [4]:
from model.llama import LlamaTransformer
from transformers import AutoModelForCausalLM

local_model = "D:/PycharmProjects/llama3_proj/models/Meta-Llama-3-8B"
# Uncomment the next line (and comment out the one after it) to evaluate the
# Hugging Face reference model instead of the custom backbone:
# llama3 = AutoModelForCausalLM.from_pretrained(local_model, torch_dtype=torch.bfloat16).to('cuda')
# Custom backbone under test, loaded from the same local checkpoint in bfloat16
llama3 = LlamaTransformer.from_pretrained("llama-3-8B", local_model, torch_type=torch.bfloat16).to('cuda')

loading weights from pretrained model: meta-llama/Meta-Llama-3-8B => {'n_layers': 32, 'n_heads': 32, 'dim': 4096, 'n_kv_heads': 8, 'vocab_size': 128256, 'max_seq_len': 2048}
parameters num: 291


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 291/291 [00:00<00:00, 333.04it/s]


In [5]:
from tqdm import tqdm

# Running counters for the accuracy computed after the loop
num_total = 0
num_correct_norm = 0

llama3.eval()

# Iterate over every validation example with a progress bar
for i, example in tqdm(val, desc="Processing", unit="example"):
    # every example is processed in this single-process run; the index i is not used
    # render the example into tokens and labels
    _, tokens, mask, label = render_example(example)
    tokens = tokens.to("cuda")
    mask = mask.to("cuda")
    # get the logits
    with torch.no_grad():
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            # logits = llama3(tokens).logits  # HF reference Llama-3 => HellaSwag accuracy: 7037/10042=0.7008
            # NOTE(review): fast_inference=False presumably returns logits for every position,
            # which the per-token loss in get_most_likely_row needs - confirm in model.llama
            logits, _ = llama3(tokens, fast_inference=False)  # custom Llama-3 => recorded run: HellaSwag accuracy 7035/10042=0.7006
        # scoring runs outside autocast but still under no_grad
        pred_norm = get_most_likely_row(tokens, mask, logits)
    num_total += 1
    num_correct_norm += int(pred_norm == label)
acc_norm = num_correct_norm / num_total
print(f"HellaSwag accuracy: {num_correct_norm}/{num_total}={acc_norm:.4f}")

# Drop the model and release cached GPU memory
del llama3
torch.cuda.empty_cache()

Processing: 100%|██████████| 10042/10042 [10:45<00:00, 15.55example/s]


HellaSwag accuracy: 7035/10042=0.7006
