In [1]:
import os

import torch
from transformers import AutoModelForMaskedLM, AutoModelWithLMHead, AutoTokenizer
# Optional HTTP(S) proxy settings for the Hugging Face download, read from the
# environment. Only variables that are actually set are forwarded, so the cell
# also runs on machines without a proxy instead of raising KeyError.
proxies = {
    scheme: os.environ[env_var]
    for scheme, env_var in (("http", "HTTP_PROXY"), ("https", "HTTPS_PROXY"))
    if os.environ.get(env_var)
}

# Keyword arguments shared by the tokenizer and model `from_pretrained` calls.
config_dict = dict(
    cache_dir="cache",
    # force_download=True,
    # resume_download=True,
    proxies=proxies or None,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese", **config_dict)
# AutoModelForMaskedLM is the non-deprecated replacement for AutoModelWithLMHead
# when the checkpoint is used for masked-token prediction.
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese", **config_dict)

# Sentence with a single masked position for the model to fill in.
sequence = f"生活的真谛是{tokenizer.mask_token}。"

# NOTE: `input` shadows the Python builtin; the name is kept because other
# cells in this notebook read it.
input = tokenizer.encode(sequence, return_tensors="pt")
# Second element of torch.where(...) holds the column indices, i.e. the
# position(s) of the mask token within the encoded sequence.
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]

# Inference only: skip building the autograd graph.
with torch.no_grad():
    token_logits = model(input).logits
mask_token_logits = token_logits[0, mask_token_index, :]

# Ids of the five highest-scoring vocabulary entries at the masked position.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
mask_token_index

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([7])

In [57]:
def print_token(x):
    """Return a label of the form ``<decoded token>[<id>]`` for vocabulary id ``x``.

    Despite the name, nothing is printed; the formatted string is returned.
    Relies on the notebook-level ``tokenizer`` to decode the id.
    """
    token_text = tokenizer.decode([x])
    return f"{token_text}[{x}]"

In [66]:
from pprint import pprint

TOP_K = 5  # number of highest-scoring candidates listed per position

# Fully specified sentence (no [MASK]); every position is masked in turn below.
sequence = "生活的真谛是美。"
input = tokenizer.encode(sequence, return_tensors="pt")

# For each position: mask it, run the model, and print the logit of the real
# token followed by the TOP_K best-scoring candidates at that position.
# The loop index is called `position` (not `mask_token_index`) so that the
# tensor of that name computed elsewhere in the notebook is not overwritten.
for position, real_token in enumerate(input[0]):
    masked_input = input.clone()
    masked_input[0, position] = tokenizer.mask_token_id
    print(tokenizer.decode(masked_input[0]))
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        token_logits = model(masked_input).logits
    mask_token_logits = token_logits[0, position, :]
    top_5_tokens = torch.topk(mask_token_logits, TOP_K).indices.tolist()
    # First entry is the real token, so its score can be compared with the
    # candidates that follow; int() keeps the id list homogeneous.
    pprint([(print_token(x), float(mask_token_logits[x])) for x in [int(real_token)] + top_5_tokens])

[MASK] 生 活 的 真 谛 是 美 。 [SEP]
[('[CLS][101]', -4.34677267074585),
 ('，[8024]', 5.548994064331055),
 ('。[511]', 5.360104560852051),
 ('的[4638]', 5.346104621887207),
 ('人[782]', 4.416329383850098),
 ('了[749]', 4.279936790466309)]
[CLS] [MASK] 活 的 真 谛 是 美 。 [SEP]
[('生[4495]', 19.903968811035156),
 ('生[4495]', 19.903968811035156),
 ('人[782]', 12.397592544555664),
 ('鲜[7831]', 10.195749282836914),
 ('美[5401]', 9.150812149047852),
 ('快[2571]', 9.078143119812012)]
[CLS] 生 [MASK] 的 真 谛 是 美 。 [SEP]
[('活[3833]', 19.629314422607422),
 ('活[3833]', 19.629314422607422),
 ('命[1462]', 19.119295120239258),
 ('存[2100]', 12.679109573364258),
 ('生[4495]', 11.334877014160156),
 ('物[4289]', 11.22317123413086)]
[CLS] 生 活 [MASK] 真 谛 是 美 。 [SEP]
[('的[4638]', 18.556386947631836),
 ('的[4638]', 18.556386947631836),
 ('之[722]', 13.935623168945312),
 ('，[8024]', 13.344372749328613),
 ('最[3297]', 10.701181411743164),
 ('其[1071]', 9.818900108337402)]
[CLS] 生 活 的 [MASK] 谛 是 美 。 [SEP]
[('真[4696]', 21.43470001220703),
 (

In [64]:
# NOTE(review): `MyList` is not referenced in any visible cell, and the recorded
# output below (`103`) is not something a plain import would display — possibly
# a stale output from an earlier version of this cell; confirm and clean up.
from mylist import MyList

103

In [34]:
# Echo `mask_token_index` so the notebook displays its current value.
mask_token_index

tensor([7])

In [7]:
# Run the model on `input` with output_hidden_states=True and display the
# resulting `hidden_states` (rendered below as a tuple of tensors).
# NOTE(review): this dumps full tensors into the output; consider displaying
# only their shapes if the values themselves are not needed.
model(input, output_hidden_states=True).hidden_states

(tensor([[[ 0.0588,  0.0704, -0.2139,  ..., -0.0237, -0.2234, -0.1116],
          [-0.2540,  0.1951, -0.9774,  ..., -1.0265, -0.0109,  1.4410],
          [-0.4060, -0.2025,  0.6814,  ..., -0.6053,  0.9638,  0.6932],
          ...,
          [ 0.1026,  0.4093,  0.0582,  ..., -0.5345, -0.1433,  0.1866],
          [ 0.1287,  0.6613,  0.3791,  ..., -0.8779, -0.4095,  1.0400],
          [ 0.1603,  0.2365,  0.2230,  ..., -0.7591, -0.5075,  0.5007]]],
        grad_fn=<NativeLayerNormBackward>),
 tensor([[[ 0.0819,  0.0069, -0.2078,  ...,  0.0875, -0.0651, -0.0418],
          [-0.0742, -0.5346, -0.8241,  ..., -1.2033, -0.1132,  0.8679],
          [ 0.3037,  0.0661,  0.8634,  ..., -0.3258,  1.1006,  0.4839],
          ...,
          [-0.0783,  0.0824, -0.3625,  ..., -0.6974,  0.1537,  0.4883],
          [ 0.0236,  0.4476, -0.1046,  ..., -0.7037, -0.1806,  0.9855],
          [ 0.0296,  0.3524,  0.1097,  ..., -0.9359, -0.3817,  0.5382]]],
        grad_fn=<NativeLayerNormBackward>),
 tensor([[[-5.

In [12]:
# List the special tokens known to the tokenizer.
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [13]:
# Exploratory: enumerate every attribute name on the tokenizer object.
# NOTE(review): the output is very long; consider removing this cell once the
# exploration is done.
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_batch_encode_plus',
 '_bos_token',
 '_cls_token',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_decode',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_from_pretrained',
 '_get_padding_truncation_strategies',
 '_mask_token',
 '_pad',
 '_pad_token',
 '_pad_token_type_id',
 '_save_pretrained',
 '_sep_token',
 '_tokenizer',
 '_unk_token',
 'add_special_tokens',
 'add_tokens',
 'additional_special_tokens',
 'additional_special_tok

In [14]:
# Map vocabulary ids 1, 2 and 3 back to their token strings.
tokenizer.convert_ids_to_tokens([1,2,3])

['[unused1]', '[unused2]', '[unused3]']

In [20]:
# IPython wildcard search: lists tokenizer attributes whose names start with
# `voca`. Interactive-only syntax (not plain Python); leftover exploration.
tokenizer.voca*?

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


tokenizer.vocab
tokenizer.vocab_files_names
tokenizer.vocab_size

In [35]:
import pandas as pd

# Display `r2` (the recorded output is an object-dtype Series of length 21128).
# NOTE(review): `r2` is not assigned in any visible cell — presumably it was
# defined in a cell that is no longer present; confirm, otherwise this raises
# NameError on a fresh kernel.
r2

1235       勇
2359       帆
5169       細
7187       铀
14316    ##包
        ... 
14903    ##塊
18099    ##简
7802       鮪
6470       谤
16773    ##氲
Length: 21128, dtype: object