In [1]:
import tensorflow as tf
from datasets import load_dataset

# 数据加载

In [3]:
dataset = load_dataset(path="seamew/ChnSentiCorp")

Using custom data configuration default
Reusing dataset chn_senti_corp (/Users/luominzhi/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


  0%|          | 0/3 [00:00<?, ?it/s]

# 加载分词器

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

# 获取 "[MASK]" token 的 id

In [5]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [6]:
tokenizer.convert_tokens_to_ids("[MASK]")

103

# 构建 dataset

In [7]:
from copy import copy
from transformers import DefaultDataCollator


mask_id = tokenizer.convert_tokens_to_ids("[MASK]")

def collate_fn(sample):
    tokenized_dict = tokenizer(sample["text"], padding="max_length", truncation=True, max_length=200)
    input_ids = tokenized_dict["input_ids"]
    attention_mask = tokenized_dict["attention_mask"]
    token_type_ids = tokenized_dict["token_type_ids"]

    # 固定将第15个词固定替换为mask
    labels = copy(input_ids[15])
    input_ids[15] = mask_id

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "mask_label": labels
    }

In [8]:
tokenized_ds = dataset.map(collate_fn)

  0%|          | 0/9600 [00:00<?, ?ex/s]

  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/1200 [00:00<?, ?ex/s]

In [9]:
collate_func = DefaultDataCollator()

tf_train_ds = tokenized_ds["train"].shuffle(seed=42).to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids", "mask_label"],
    shuffle=True,
    collate_fn=collate_func,
    batch_size=8,
)

tf_test_ds = tokenized_ds["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids", "mask_label"],
    shuffle=False,
    collate_fn=collate_func,
    batch_size=8,
)

Metal device set to: Apple M1 Max


  batch[k] = torch.tensor([f[k] for f in features])
2022-08-29 21:34:56.157204: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-29 21:34:56.157557: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
def transform(batch_samples):
    labels = batch_samples.pop("mask_label")
    return batch_samples, labels

In [11]:
tf_train_ds = tf_train_ds.map(transform)
tf_test_ds = tf_test_ds.map(transform)

# 加载预训练模型

In [21]:
from transformers import TFBertModel
pretrained = TFBertModel.from_pretrained("bert-base-chinese")

Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [22]:
pretrained(
    input_ids=next(iter(tf_train_ds))[0]["input_ids"],
    attention_mask=next(iter(tf_train_ds))[0]["attention_mask"],
    token_type_ids=next(iter(tf_train_ds))[0]["token_type_ids"]
)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(8, 200, 768), dtype=float32, numpy=
array([[[-0.44781572,  0.82893956, -0.26868275, ..., -0.0603919 ,
         -0.6994644 , -0.34850925],
        [-0.01748601,  0.561159  ,  0.21528283, ..., -1.1898596 ,
         -0.3126536 ,  0.22143382],
        [ 0.73083484,  0.04203127, -0.25058508, ...,  0.0770507 ,
          0.82417315, -0.13127358],
        ...,
        [ 0.36105156,  0.97246516, -0.37264735, ...,  0.8098067 ,
         -0.4026322 ,  0.3612998 ],
        [ 0.3319987 ,  0.94837236, -0.40873444, ...,  0.8073497 ,
         -0.36991626,  0.39124346],
        [ 0.35441408,  0.91659343, -0.43206602, ...,  0.7997105 ,
         -0.35436577,  0.33308154]],

       [[-0.51664114,  0.6124891 , -0.06172222, ..., -0.3142174 ,
         -0.20757127, -0.2774792 ],
        [ 0.27634692,  0.26382828,  0.452016  , ..., -0.59215426,
          0.08139569, -0.22116199],
        [-0.61892104, -0.9218377 ,  0.59397715, .

# 构建下游任务

In [44]:
class Model(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super(Model, self).__init__(*args, **kwargs)
        self.pretrained = pretrained = TFBertModel.from_pretrained("bert-base-chinese")
        self.dense = tf.keras.layers.Dense(tokenizer.vocab_size)

    def call(self, inputs, training=None, mask=None):
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        x = self.pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        x = self.dense(x.last_hidden_state[:, 15])

        return x

model = Model()
# model(next(iter(tf_train_ds))[0])

Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [45]:
model(next(iter(tf_train_ds))[0])

<tf.Tensor: shape=(8, 21128), dtype=float32, numpy=
array([[ 0.08372209, -0.28340688, -0.01711703, ...,  0.06099325,
         0.11833639, -0.2591524 ],
       [ 0.15515658, -0.3064114 ,  0.13029285, ...,  0.04500456,
         0.2360949 , -0.0155678 ],
       [ 0.06741177, -0.15236981,  0.39543068, ...,  0.11593656,
         0.26949406,  0.09125978],
       ...,
       [ 0.02691518, -0.16973808,  0.07321474, ...,  0.17139563,
         0.10869642, -0.2228465 ],
       [ 0.16064548, -0.17414498,  0.00160121, ...,  0.3750874 ,
         0.2034133 , -0.22468168],
       [-0.1425188 , -0.19816181,  0.03535921, ...,  0.16553333,
         0.11308062, -0.01194772]], dtype=float32)>

In [46]:
model.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tf_bert_model_5 (TFBertMode  multiple                 102267648 
 l)                                                              
                                                                 
 dense_9 (Dense)             multiple                  16247432  
                                                                 
Total params: 118,515,080
Trainable params: 118,515,080
Non-trainable params: 0
_________________________________________________________________


# 训练模型

In [47]:
model = Model()

model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

model.fit(
    tf_train_ds,
    epochs=4,
    validation_data=tf_test_ds
)

Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/4


2022-08-29 22:02:39.772418: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-08-29 22:10:46.031039: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
 124/1200 [==>...........................] - ETA: 6:59 - loss: 6.1118 - accuracy: 0.0655

KeyboardInterrupt: 

In [53]:
x_test = next(iter(tf_test_ds))[0]
y_test = next(iter(tf_test_ds))[1]

x_test, y_test

({'labels': <tf.Tensor: shape=(8,), dtype=float32, numpy=array([1., 0., 0., 1., 1., 1., 0., 1.], dtype=float32)>,
  'input_ids': <tf.Tensor: shape=(8, 200), dtype=int64, numpy=
  array([[ 101, 6821,  702, ...,    0,    0,    0],
         [ 101, 2577, 4708, ...,    0,    0,    0],
         [ 101, 6820, 4924, ...,    0,    0,    0],
         ...,
         [ 101, 3300,  749, ...,    0,    0,    0],
         [ 101, 1184, 1378, ...,    0,    0,    0],
         [ 101,  122,  119, ...,    0,    0,    0]])>,
  'attention_mask': <tf.Tensor: shape=(8, 200), dtype=int64, numpy=
  array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]])>,
  'token_type_ids': <tf.Tensor: shape=(8, 200), dtype=int64, numpy=
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..

In [59]:
tokenizer.decode(x_test["input_ids"][0])

'[CLS] 这 个 宾 馆 比 较 陈 旧 了 ， 特 价 的 房 [MASK] 也 很 一 般 。 总 体 来 说 一 般 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [69]:
tf.argmax(model(inputs=x_test), axis=1)

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([8024, 8024, 8024, 8024, 8024, 8024, 8024, 8024])>

In [70]:
tokenizer.decode(8024)

'，'