## 1. 加载预训练模型

In [1]:
from transformers import AutoModelForMaskedLM
# Load the Chinese BERT checkpoint ("bert-base-chinese") with a masked-LM head.
# The "Some weights ... were not used" warning in this cell's output lists only
# the cls.seq_relationship.* weights, which BertForMaskedLM does not use.
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Display the model's configuration object (hidden size, layer count,
# vocabulary size, dropout rates, ...).
model.config

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [3]:
# Display the model structure.
# NOTE(review): `parameters` is referenced without being called, so the cell
# displays the bound method's repr, which happens to embed the module tree.
# A bare `model` would presumably show the same tree directly - confirm and
# re-run before changing, since the recorded output would differ.
model.parameters

<bound method Module.parameters of BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,

## 2. 加载词元化工具

In [4]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "bert-base-chinese" checkpoint and
# display its summary (vocabulary size, max length, special tokens).
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Two short example sentences that are encoded together as one sentence pair.
sent1 = '我爱机器学习'
sent2 = '我更爱深度学习'

# Encode the pair into a single list of token ids.
# With special tokens the pair needs 16 ids (1 + 6 + 1 + 7 + 1), one more than
# max_length, so one token is truncated.
encode_result = tokenizer.encode(
    text=sent1,
    text_pair=sent2,
    add_special_tokens=True,
    max_length=15,
    truncation=True,        # cut the input when it is longer than max_length
    padding='max_length',   # always pad up to max_length
    return_tensors=None,    # no tensor conversion requested
)
print(encode_result)

[101, 2769, 4263, 3322, 1690, 2110, 739, 102, 2769, 3291, 4263, 3918, 2428, 2110, 102]


In [6]:
# Turn the ids back into text to inspect the special tokens and the effect of
# truncation on the encoded pair.
tokenizer.decode(encode_result)

'[CLS] 我 爱 机 器 学 习 [SEP] 我 更 爱 深 度 学 [SEP]'

In [7]:
# Fetch the tokenizer's vocabulary as a dictionary.
mydict = tokenizer.get_vocab()

# Its type, its size, and whether the word '强化' already has its own entry.
(type(mydict), len(mydict), '强化' in mydict)

(dict, 21128, False)

In [8]:
# Register two new ordinary tokens (whole words rather than single characters).
tokenizer.add_tokens(new_tokens=['强化', '学习'])

# Register a new special token and use it as the end-of-sequence marker.
tokenizer.add_special_tokens({'eos_token': '[EOS]'})

# NOTE(review): the tokenizer's vocabulary grows here; if the model loaded
# earlier is fed the new ids, its embeddings presumably need
# `model.resize_token_embeddings(len(tokenizer))` first - confirm before use.

# Re-read the vocabulary so that it includes the additions.
mydict = tokenizer.get_vocab()

# Its type, its new size, and the ids assigned to '强化' and '[EOS]'.
(type(mydict), len(mydict), mydict['强化'], mydict['[EOS]'])

(dict, 21131, 21128, 21130)

In [9]:
# Encode a sentence that uses the newly added tokens and the new [EOS] symbol.
encode_result = tokenizer.encode(
    text='学习强化学习[EOS]',
    text_pair=None,         # single sentence, no second segment
    add_special_tokens=True,
    max_length=10,
    truncation=True,        # cut the input when it is longer than max_length
    padding='max_length',   # always pad up to max_length
    return_tensors=None,    # no tensor conversion requested
)

# Show the raw ids first ...
print(encode_result)

# ... then the text they decode back to.
tokenizer.decode(encode_result)

[101, 21129, 21128, 21129, 21130, 102, 0, 0, 0, 0]


'[CLS] 学习 强化 学习 [EOS] [SEP] [PAD] [PAD] [PAD] [PAD]'