In [1]:
r"""
T5Model预训练参数加载对齐_2
"""

'\nT5Model预训练参数加载对齐_2\n'

In [2]:
# select device
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import torch, mindspore, json
import numpy as np
import transformers.models.t5 as pt
import mindnlp.models.t5 as m

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# files path
path = {
    't5-small':'',
    't5-base':'',
    't5-large':'',
    't5-3b':'',
    't5-11b':''
}

In [5]:
size = 't5-3b'

In [6]:
config_path = f"{path[size]}/{size}_config.json"
with open(config_path, encoding='utf-8') as config:
    config = json.load(config)
# init config
pt_config = pt.T5Config(**config)
ms_config = m.T5Config(**config)

In [7]:
# init model
pt_model = pt.T5Model(pt_config)
ms_model = m.T5Model(ms_config)

# load parameters
pt_dict = torch.load(f"{path[size]}/pytorch_model.bin")
pt_model.load_state_dict(pt_dict, False) 

ms_dict = mindspore.load_checkpoint(f"{path[size]}/{size}_model.ckpt")
param_not_load = mindspore.load_param_into_net(ms_model, ms_dict)
print(f"Param_not_load:{param_not_load}")

# set eval mode
pt_model.eval()
ms_model.set_train(False)

Param_not_load:([], ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.embedding_table'])


T5Model<
  (shared): Embedding<vocab_size=32128, embedding_size=1024, use_one_hot=False, embedding_table=Parameter (name=decoder.embed_tokens.embedding_table, shape=(32128, 1024), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
  (encoder): T5Stack<
    (embed_tokens): Embedding<vocab_size=32128, embedding_size=1024, use_one_hot=False, embedding_table=Parameter (name=decoder.embed_tokens.embedding_table, shape=(32128, 1024), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
    (block): CellList<
      (0): T5Block<
        (layer): CellList<
          (0): T5LayerSelfAttention<
            (SelfAttention): T5Attention<
              (q): Dense<input_channels=1024, output_channels=4096>
              (k): Dense<input_channels=1024, output_channels=4096>
              (v): Dense<input_channels=1024, output_channels=4096>
              (o): Dense<input_channels=4096, output_channels=1024>
              (relative_attention_bias): Embedding<vocab_siz

In [8]:
# tokenizer
tokenizer = pt.T5Tokenizer.from_pretrained(size)

# prepare data
input_ids = "translate English to German: With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task."
decoder_input_ids = [[np.random.randint(0,1000)]]

pt_input_ids = tokenizer([input_ids], return_tensors="pt").input_ids
pt_decoder_input_ids = torch.tensor(decoder_input_ids, dtype=torch.long)
ms_input_ids = mindspore.Tensor(pt_input_ids.detach().numpy()).to(mindspore.int64)
ms_decoder_input_ids = mindspore.Tensor(pt_decoder_input_ids.detach().numpy()).to(mindspore.int64)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-3b automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
# output
pt_out = pt_model(input_ids=pt_input_ids, decoder_input_ids=pt_decoder_input_ids)
ms_out = ms_model(input_ids=ms_input_ids, decoder_input_ids=ms_decoder_input_ids)

In [10]:
# shape & loss
assert ms_out[0].shape == pt_out[0].shape
assert np.allclose(ms_out[0].asnumpy(), pt_out[0].detach().numpy(), 5e-3, 5e-3)
for i in range(len(ms_out[1])):
    for j in range(len(ms_out[1][i])):
        assert ms_out[1][i][j].shape == pt_out[1][i][j].shape
        assert np.allclose(ms_out[1][i][j].asnumpy(), pt_out[1][i][j].detach().numpy(), 5e-3, 5e-3)
assert ms_out[2].shape == pt_out[2].shape
assert np.allclose(ms_out[2].asnumpy(), pt_out[2].detach().numpy(), 5e-3, 5e-3)
print("PASS")

PASS
