## Validate Tokenizer

In [5]:
from megatron.training.tokenizer.tokenizer import _HuggingFaceTokenizer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Single source of truth for the checkpoint directory, so both tokenizers are
# guaranteed to be built from the same files.
tokenizer_path = "/data/models/Emu3-Gen"

# Megatron-side wrapper around the Hugging Face tokenizer.
tokenizer_megatron = _HuggingFaceTokenizer(tokenizer_path)

# Reference tokenizer loaded directly through transformers.
# NOTE: trust_remote_code=True allows transformers to execute tokenizer code
# shipped with the checkpoint; only point this at a checkpoint you trust.
tokenizer_huggingface = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

  vision_tokens = [t.strip() for t in open(special_tokens_file).readlines() if len(t.strip()) > 0]


In [7]:
# Display the `eof_token` attribute of the Hugging Face tokenizer as the cell output.
tokenizer_huggingface.eof_token

'<|extra_201|>'

In [8]:
# Sample prompts used to check that both tokenizers produce the same ids.
word_list = ['a portrait of young girl.', 'a portrait of young man.']
# The Megatron wrapper is given the whole list in one call, while the Hugging Face
# tokenizer encodes each prompt separately; the two results must be equal.
assert tokenizer_megatron.tokenize(word_list) == list(map(tokenizer_huggingface.encode, word_list))


## Validate Model Forward and Backward

In [25]:
import numpy as np

In [26]:
# Directory holding the dumped .npy activations used by the comparison cells.
cache_dir = "/root/Megatron-LM/cache"


def inspect_output(hf_array, megatron_array):
    """Print the shapes of two arrays and statistics of their absolute difference.

    Args:
        hf_array: first array; reported under the ``hf_array`` label.
        megatron_array: second array; reported under the ``megatron_array`` label.

    Returns:
        None. Two lines are printed: the shapes, then the minimum, maximum and
        mean of ``|hf_array - megatron_array|``.

    The subtraction follows NumPy broadcasting rules, so arrays with different
    but broadcast-compatible shapes are accepted without any warning.
    """
    # Shapes are printed first so they are visible even if the subtraction fails.
    print(f"hf_array.shape: {hf_array.shape}, megatron_array.shape: {megatron_array.shape}")
    abs_delta = np.absolute(hf_array - megatron_array)
    min_diff, max_diff, mean_diff = abs_delta.min(), abs_delta.max(), abs_delta.mean()
    print(f"min_diff: {min_diff}, max_diff: {max_diff}, mean_diff: {mean_diff}")



In [27]:
# Word-embedding check: load both dumps from cache_dir (instead of repeating the
# absolute path) and compare them within the tolerance used throughout this notebook.
embedding_hf = np.load(f"{cache_dir}/hf_model.embed_tokens.npy")
embedding_megatron = np.load(f"{cache_dir}/megatron_embedding.word_embeddings.npy")
embedding_flag = np.allclose(embedding_hf, embedding_megatron, atol=5e-5, rtol=1e-5)
print("embedding: ", embedding_flag)
if not embedding_flag:
    # Only print the detailed statistics when the tolerance check fails.
    inspect_output(embedding_hf, embedding_megatron)

# embedding dropout
embedding_dropout_hf = np.load(f"{cache_dir}/hf_model.dropout.npy")
embedding_dropout_megatron = np.load(f"{cache_dir}/megatron_embedding.embedding_dropout.npy")
# The Megatron dump is compared with its first two axes swapped; compute that
# view once and reuse it for both the tolerance check and the diagnostics.
# NOTE(review): presumably Megatron stores (seq, batch, hidden) and HF (batch, seq, hidden) - confirm.
embedding_dropout_megatron_t = embedding_dropout_megatron.transpose(1, 0, 2)
dropout_flag = np.allclose(embedding_dropout_hf, embedding_dropout_megatron_t, atol=5e-5, rtol=1e-5)
print("embedding dropout: ", dropout_flag)
if not dropout_flag:
    inspect_output(embedding_dropout_hf, embedding_dropout_megatron_t)

embedding:  True
embedding dropout:  True


In [28]:
# qkv_proj
# Compare the query slice of Megatron's layer-0 linear_qkv dump with the
# Hugging Face layer-0 q_proj dump.
qkv_megatron = np.load(f"{cache_dir}/megatron_decoder.layers.0.self_attention.linear_qkv.npy_o0.npy")
# Swap the first two axes, split the last axis into 8 groups, keep the first
# 512 values of every group and flatten the groups back into one axis.
# NOTE(review): presumably each group is laid out as [q | k | v] and the first
# 512 entries are the query heads of that group - confirm against the model config.
q_megatron = qkv_megatron.transpose(1,0,2).reshape(1, 128, 8, -1)[:, :, :, :512].reshape(1, 128, -1)
q_hf = np.load(f"{cache_dir}/hf_model.layers.0.self_attn.q_proj.npy")
layer_norm_flag = np.allclose(q_megatron, q_hf, atol=5e-5, rtol=1e-5)
# NOTE(review): the label says "layer_norm" although the arrays compared are query
# projections - presumably because the linear_qkv output also depends on the
# preceding layer norm; confirm or rename.
print("layer_norm: ", layer_norm_flag)
if not layer_norm_flag:
    # inspect_output labels its arguments (hf_array, megatron_array), so the
    # Hugging Face array must be passed first to get correct shape labels.
    inspect_output(q_hf, q_megatron)

layer_norm:  True


In [16]:
# Recompute the projections by hand in NumPy and compare them with each other
# and with the dumped Megatron result.
# NOTE(review): `megatron_linear_qkv` and `hf_linear_q` are not defined in any
# visible cell - presumably weight matrices loaded in a cell that was deleted or
# lies outside this view. On a fresh kernel this cell fails unless they are
# defined first; `q_megatron` also comes from a cell with a later execution count.
# Input: presumably the dumped output of the HF layer-0 input_layernorm - confirm.
x = np.load('/root/Megatron-LM/cache/hf_model.layers.0.input_layernorm.npy')

# megatron qkv
megatron_qkv = x@megatron_linear_qkv.T
# Split into 8 groups, keep the first 512 values of each group and flatten back.
megatron_q = megatron_qkv.reshape(128, 8, -1)[:, :, :512].reshape(128, -1)
# Same input multiplied by the (transposed) HF q weight.
hf_q = x@hf_linear_q.T
# NOTE(review): inspect_output labels its first argument "hf_array" and its second
# "megatron_array"; in both calls below the first argument is the Megatron-side
# recomputation, so the printed shape labels do not match the data.
inspect_output(megatron_q, hf_q)
# The two arrays below have different ranks in the recorded output; the comparison
# relies on NumPy broadcasting.
inspect_output(megatron_q, q_megatron)

hf_array.shape: (128, 4096), megatron_array.shape: (1, 128, 4096)
min_diff: 0.0, max_diff: 0.0, mean_diff: 0.0
hf_array.shape: (128, 4096), megatron_array.shape: (1, 128, 4096)
min_diff: 0.0, max_diff: 0.00046753883361816406, mean_diff: 1.3898101315135136e-05


In [56]:
# Compare the dumped HF q_proj output (q_hf) with the NumPy recomputation (hf_q).
# NOTE(review): both names are left over from earlier cells; this cell only works
# after those cells have been run in the same kernel.
inspect_output(q_hf, hf_q)

hf_array.shape: (1, 128, 4096), megatron_array.shape: (1, 128, 4096)
min_diff: 0.0, max_diff: 1.9073486328125e-06, mean_diff: 3.73696664723866e-08


n_out:  [[-0.20171243  0.06703024  0.0255844  ...  0.27597302 -0.20493615
   0.15210561]
 [ 0.15066259 -0.37075084  0.34638226 ...  0.44702706 -0.1128664
  -0.05870222]
 [ 0.00606197 -0.01582064  0.09621283 ... -0.28794834 -0.0145349
   0.14136854]
 ...
 [-0.32960933  0.01034221  0.12024365 ... -0.20597263 -0.02718414
  -0.00516073]
 [-0.29591063  0.10484271  0.207427   ... -0.21518306  0.0686103
   0.46210706]
 [ 0.13179992 -0.09774409  0.18862736 ... -0.11785763 -0.1223011
  -0.5176106 ]]

In [24]:
# Load the two final-output dumps and report their element-wise absolute difference.
# The paths are relative to the current working directory, unlike the cache_dir-based
# loads used for the per-layer checks.
# NOTE(review): the recorded output of this cell shows max_diff ~ 22.8 and
# mean_diff ~ 3.64, far outside the atol=5e-5 used for the per-layer checks.
# TODO: explain or investigate this divergence before treating the model as validated.
output_hf = np.load("output_hf.npy")
output_megatron = np.load("output_megatron.npy")
inspect_output(output_hf, output_megatron)

hf_array.shape: (1, 128, 184622), megatron_array.shape: (1, 128, 184622)
min_diff: 0.0, max_diff: 22.800884246826172, mean_diff: 3.6361570358276367


## Validate TE

In [20]:

import torch
import transformer_engine.pytorch as te
import numpy as np

# Turn on PyTorch's TF32 switches for cuDNN and for CUDA matmuls before running
# the TE-vs-torch comparison.
# NOTE(review): per the PyTorch docs TF32 trades matmul precision for speed on
# supporting GPUs; whether transformer_engine's Linear honours these flags is
# not visible here - confirm.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

In [21]:
# Input activations, moved to the GPU as a torch tensor.
# NOTE(review): `x` was a NumPy array in an earlier cell and is rebound to a CUDA
# tensor here; `megatron_linear_qkv` is not defined in any visible cell.
x = torch.from_numpy(np.load('/root/Megatron-LM/cache/hf_model.layers.0.input_layernorm.npy')).cuda()

# Two bias-free 4096 -> 6144 linear layers: Transformer Engine and plain PyTorch.
te_mlp = te.Linear(4096, 6144, bias=False)
torch_mlp = torch.nn.Linear(4096, 6144, bias=False, device='cuda')

# Give both layers the same weights so that any output difference can only
# come from the implementation.
with torch.no_grad():
    for layer in (te_mlp, torch_mlp):
        layer.weight.copy_(torch.from_numpy(megatron_linear_qkv).cuda())

In [23]:
def _query_slice(linear_out):
    """Convert a layer output to NumPy and keep the first 512 values of each of 8 groups."""
    as_numpy = linear_out.detach().cpu().numpy()
    return as_numpy.reshape(128, 8, -1)[:, :, :512].reshape(128, -1)


# Run the same input through both layers and compare the extracted slices.
q_te = _query_slice(te_mlp(x))
q_torch = _query_slice(torch_mlp(x))
# NOTE(review): inspect_output labels its arguments "hf_array"/"megatron_array";
# here they are the TE result and the torch result respectively.
inspect_output(q_te, q_torch)

hf_array.shape: (128, 4096), megatron_array.shape: (128, 4096)
min_diff: 0.0, max_diff: 0.0, mean_diff: 0.0
