In [34]:
# Notebook title (English: "T5Attention module alignment"). The cells below copy
# the PyTorch T5Attention weights into the MindSpore model and compare outputs.
r"""
T5Attention模块对齐
"""

'\nT5Attention模块对齐\n'

In [35]:
# select device
# Restrict the CUDA devices visible to this process to index 1. This cell sits
# ahead of the torch/mindspore imports.
# NOTE(review): presumably the frameworks read this variable when they
# initialise CUDA, so it has to stay before those imports — confirm.
import os

os.environ.update(CUDA_VISIBLE_DEVICES='1')

In [36]:
import torch, mindspore
import numpy as np
from transformers.models.t5 import modeling_t5 as pt
import mindnlp.models.t5 as m

In [37]:
# MindSpore side: library-default config, then the attention layer under test
# with its relative-attention-bias embedding enabled.
ms_config = m.T5Config()
ms_model = m.T5Attention(ms_config, has_relative_attention_bias=True)

# PyTorch side: same construction, used as the reference implementation.
pt_config = pt.T5Config()
pt_model = pt.T5Attention(pt_config, has_relative_attention_bias=True)

T5Attention(
  (q): Linear(in_features=512, out_features=512, bias=False)
  (k): Linear(in_features=512, out_features=512, bias=False)
  (v): Linear(in_features=512, out_features=512, bias=False)
  (o): Linear(in_features=512, out_features=512, bias=False)
  (relative_attention_bias): Embedding(32, 8)
)

In [38]:
# Snapshot the PyTorch reference weights and list their names, one per line.
# `pt_params` is reused later when the weights are copied into the MindSpore model.
pt_params = pt_model.state_dict()
for pt_name in pt_params:
    print(pt_name)

q.weight
k.weight
v.weight
o.weight
relative_attention_bias.weight


In [39]:
# List the MindSpore parameter names so they can be compared by eye with the
# PyTorch names printed by the previous cell.
for ms_name, _ms_param in ms_model.parameters_and_names():
    print(ms_name)

q.weight
k.weight
v.weight
o.weight
relative_attention_bias.embedding_table


In [40]:
# load parameters
# Copy every PyTorch weight into the MindSpore parameter of the same name.
# The embedding weight is named `embedding_table` on the MindSpore side and
# `weight` on the PyTorch side, so the name is translated before the lookup.
for ms_name, ms_param in ms_model.parameters_and_names():
    pt_name = ms_name.replace('embedding_table', 'weight')
    # Fail with a readable message instead of an AttributeError on None when a
    # MindSpore parameter has no PyTorch counterpart.
    if pt_name not in pt_params:
        raise KeyError(
            f"no PyTorch parameter '{pt_name}' for MindSpore parameter '{ms_name}'"
        )
    ms_param.set_data(mindspore.Tensor(pt_params[pt_name].detach().numpy()))

In [None]:
# set eval mode
# NOTE(review): this cell has no execution count (`In [None]`) in the saved
# notebook — make sure it is run before the forward pass.
# Both models are switched out of training mode; presumably this disables
# dropout so the two outputs are comparable — confirm against the layer code.
ms_model.set_train(False)  # MindSpore: turn the training flag off
pt_model.eval()  # PyTorch: switch to evaluation mode

In [41]:
# prepare data
# Draw the input from a seeded generator so every run (and every
# Restart & Run All) feeds both models exactly the same values.
SEED = 0
rng = np.random.default_rng(SEED)

# The last dimension (512) matches the in_features of the q/k/v projections
# printed for the PyTorch model.
# NOTE(review): the leading dimensions are presumably (batch=4, seq_len=64) — confirm.
x = rng.standard_normal((4, 64, 512))

# Both frameworks get the same float64 array, cast to float32. Use an integer
# dtype instead when the module under test expects token ids.
ms_x = mindspore.Tensor(x, dtype=mindspore.float32)
pt_x = torch.tensor(x, dtype=torch.float32)

In [42]:
# output
# Run the same input through both implementations.
ms_out = ms_model(ms_x)

# Only forward values are compared, so the PyTorch call runs under no_grad to
# skip autograd bookkeeping. Later `.detach()` calls still work on the result.
with torch.no_grad():
    pt_out = pt_model(pt_x)

In [43]:
# shape & loss
# Element 1 of both outputs is None in this configuration, so it is checked
# for None-ness rather than compared numerically.
assert ms_out[1] is None and pt_out[1] is None

# Elements 0 and 2 are tensors: compare shape first, then values.
for idx in (0, 2):
    ms_arr = ms_out[idx].asnumpy()
    pt_arr = pt_out[idx].detach().numpy()
    assert ms_arr.shape == pt_arr.shape, (
        f"output {idx}: shape {ms_arr.shape} != {pt_arr.shape}"
    )
    # assert_allclose reports the mismatching elements and the max error on
    # failure. equal_nan=False keeps NaNs a failure, as with np.allclose.
    np.testing.assert_allclose(
        ms_arr,
        pt_arr,
        rtol=1e-5,
        atol=1e-5,
        equal_nan=False,
        err_msg=f"output {idx} differs between MindSpore and PyTorch",
    )
print("PASS")

PASS


In [49]:
def judge(o1, o2, loss=1e-3, prefix='-'):
    """Recursively compare a MindSpore output with its PyTorch counterpart.

    Tuples and lists are walked element by element; one line is printed per
    leaf (or per structural mismatch), indented with one extra '-' per
    nesting level.

    Args:
        o1: MindSpore-side value: a ``mindspore.Tensor``, a (nested)
            tuple/list of values, or any other object (e.g. ``None``).
        o2: PyTorch-side value with the same structure; tensor leaves must
            provide ``.detach().numpy()``.
        loss: Tolerance passed to ``np.allclose`` as both ``rtol`` and ``atol``.
        prefix: Indentation prefix for the printed lines; grows by one '-'
            per recursion level.

    Returns:
        bool: True when the structures match and every leaf compares equal
        or close, False otherwise.
    """
    prefix += '-'

    if isinstance(o1, (tuple, list)):
        # A differing structure is a mismatch, not an IndexError or a
        # silently ignored tail.
        if not isinstance(o2, (tuple, list)):
            print(f"{prefix}False (structure mismatch: {type(o1)} vs {type(o2)})")
            return False
        if len(o1) != len(o2):
            print(f"{prefix}False (length mismatch: {len(o1)} vs {len(o2)})")
            return False
        # Evaluate every pair (no short-circuit) so that all leaves are reported.
        results = [judge(a, b, loss=loss, prefix=prefix) for a, b in zip(o1, o2)]
        return all(results)

    if isinstance(o1, mindspore.Tensor):
        # Guard against a tensor being paired with a non-tensor such as None.
        if not hasattr(o2, 'detach'):
            print(f"{prefix}False (expected a tensor, got {type(o2)})")
            return False
        ms_arr = o1.asnumpy()
        pt_arr = o2.detach().numpy()
        # Require equal shapes so broadcasting cannot produce a false match.
        matched = ms_arr.shape == pt_arr.shape and bool(
            np.allclose(ms_arr, pt_arr, rtol=loss, atol=loss)
        )
        print(f"{prefix}{matched}")
        return matched

    # Any other leaf (e.g. None): fall back to plain equality.
    equal = o1 == o2
    print(f"{type(o1)}-{type(o2)}:{equal}")
    return bool(np.all(equal))

In [50]:
# Walk both output tuples in parallel and print one comparison line per element.
judge(o1=ms_out, o2=pt_out)

---True
<class 'NoneType'>-<class 'NoneType'>:True
---True
