In [1]:
# 安装模块
!pip install tensorflow==2.2.0 bert4keras

Collecting tensorflow==2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/1a/0d79814736cfecc825ab8094b39648cc9c46af7af1bae839928acb73b4dd/tensorflow-2.2.0-cp37-cp37m-manylinux2010_x86_64.whl (516.2MB)
[K     |████████████████████████████████| 516.2MB 33kB/s 
[?25hCollecting bert4keras
[?25l  Downloading https://files.pythonhosted.org/packages/58/39/4cbf03e4cc7ab87beba6a092ce830e297cd0f60bf2c5099ebf964a3b25db/bert4keras-0.10.1.tar.gz (46kB)
[K     |████████████████████████████████| 51kB 7.6MB/s 
Collecting tensorflow-estimator<2.3.0,>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f5/926ae53d6a226ec0fda5208e0e581cffed895ccc89e36ba76a8e60895b78/tensorflow_estimator-2.2.0-py2.py3-none-any.whl (454kB)
[K     |████████████████████████████████| 460kB 47.4MB/s 
Collecting tensorboard<2.3.0,>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/1d/74/0a6fcb206dcc72a6da9a62dd81784bfdbff5fedb099982861dc2219014fb/tensorboard-2.2.2-py3-none

## 读取训练数据
数据存放在个人drive硬盘中。需要加载drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from typing import List, Dict
from tqdm import tqdm
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import math

train_data_path = '/content/drive/MyDrive/icwb2-data/training/msr_training.utf8'


def read_data_from_file(filename: str, encoding='utf-8') -> List[List[str]]:
    """
    原始数据格式如下：
    “  这  首先  是  个  民族  问题  ，  民族  的  感情  问题  。
    ’  我  扔  了  两颗  手榴弹  ，  他  一下子  出  溜  下去  。
    “  废除  先前  存在  的  所有制  关系  ，  并不是  共产主义  所  独具  的  特征  。
    读取一行，再对每一行进行空格分割处理，最终返回如下
    [
        ['这','首先',...],
        ['我','扔',...],
        ...
    ]
    """
    res = []
    with open(filename, 'r', encoding=encoding) as fd:
        for line in fd.readlines():
            line = line.strip().split(' ')
            res.append([w for w in line if w])
    return res

## 创建Tokenizer
直接对数据进行编码。包装在List中，方便后续采用batch调用。

标签采用4标签原则，分别是"B、M、E、S"

In [4]:
class Tokenizer:
    def __init__(self, data: List[List[str]], tag2id: Dict[str, str]):
        self.src_data = data
        self.tag2id = tag2id
        self.char_data: List[List[int]] = []
        self.label: List[List[str]] = []
        self.token2id: Dict[str, int] = {}

    def tokenize(self):
        for sentence in tqdm(self.src_data):
            sub_label: List[str] = []
            sub_sent: List[str] = []
            for token in sentence:
                if len(token) == 1:
                    sub_sent.append(token)
                    sub_label.append(self.tag2id.get('S'))
                elif len(token) == 2:
                    sub_sent.extend(list(token))
                    sub_label.extend([self.tag2id.get('B'), self.tag2id.get('E')])
                else:
                    sub_sent.extend(list(token))
                    sub_label.extend([self.tag2id.get('B')] +
                                     [self.tag2id.get('M')] * (len(token) - 2) +
                                     [self.tag2id.get('E')])

            sub_sent_ = self._token2id(sub_sent)

            assert len(sub_sent_) == len(sub_label)
            self.char_data.append(sub_sent_)
            self.label.append(sub_label)

    def _token2id(self, sentence: List[str]) -> List[int]:
        res = []
        for w in sentence:
            if w not in self.token2id:
                self.token2id[w] = len(self.token2id) + 1
            res.append(self.token2id.get(w))
        return res

    @property
    def id2token(self):
        return {v: k for k, v in self.token2id.items()}


In [7]:
class DataLoader(Sequence):
    def __init__(self, data, target, batch_size=64):
        self.data = data
        self.target = target
        self.batch_size = batch_size

    def __getitem__(self, index):
        batch_x = self.data[index * self.batch_size:(index + 1) * self.batch_size]
        batch_y = self.target[index * self.batch_size:(index + 1) * self.batch_size]
        return pad_sequences(batch_x, padding='post'), to_categorical(pad_sequences(batch_y, padding='post', value=4), num_classes=5)

    def __len__(self):
        return math.ceil(len(self.data) / self.batch_size)

In [8]:
pos_tag = list('BMES')

tag2id = {item: i for i, item in enumerate(pos_tag)}
id2tag = {v: k for k, v in tag2id.items()}
print(tag2id)

{'B': 0, 'M': 1, 'E': 2, 'S': 3}


In [9]:
train_data = read_data_from_file(train_data_path)
tokenizer = Tokenizer(train_data, tag2id=tag2id)
tokenizer.tokenize()


for idx, batch in enumerate(DataLoader(tokenizer.char_data, tokenizer.label)):
    x, y = batch
    print(x.shape)
    print(y.shape)

    if idx == 10:
        break

100%|██████████| 86924/86924 [00:03<00:00, 28645.46it/s]


(64, 78)
(64, 78, 5)
(64, 77)
(64, 77, 5)
(64, 84)
(64, 84, 5)
(64, 88)
(64, 88, 5)
(64, 85)
(64, 85, 5)
(64, 93)
(64, 93, 5)
(64, 112)
(64, 112, 5)
(64, 73)
(64, 73, 5)
(64, 70)
(64, 70, 5)
(64, 87)
(64, 87, 5)
(64, 58)
(64, 58, 5)


In [10]:
import os
# 使用bert4keras模块中的CRF和维特比解码
# 需要设置后端，这里使用TF后端

os.environ['TF_KERAS'] = '1'

from bert4keras.layers import ConditionalRandomField
from bert4keras.snippets import ViterbiDecoder

## 构建模型
这里使用简单的CNN+CRF网络。具体参考[苏剑林的案例](https://github.com/bojone/crf/blob/master/word_seg.py)。

In [11]:
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras import layers as L

CRF = ConditionalRandomField()

vocab_size = len(tokenizer.token2id)

cnn = Sequential([
        L.Embedding(vocab_size + 1, 100),
        L.Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'),
        L.Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'),
        L.Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'),
        L.Dense(5, activation='softmax'),
        # ConditionalRandomField()
    ], name='cnn')
cnn.summary()
x_in = Input(shape=(None,))
x = cnn(x_in)
out = CRF(x)
model = Model(x_in, out)

model.summary()

Model: "cnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         516800    
_________________________________________________________________
conv1d (Conv1D)              (None, None, 256)         77056     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 256)         196864    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 256)         196864    
_________________________________________________________________
dense (Dense)                (None, None, 5)           1285      
Total params: 988,869
Trainable params: 988,869
Non-trainable params: 0
_________________________________________________________________
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape 

In [26]:
from typing import Union

class CWS(ViterbiDecoder):
    def parse_tag(self, text: Union[str, List[str]]):
        """text 是char级别的list或者str"""
        encode_text = [tokenizer.token2id.get(w) for w in text]
        nodes = model.predict([encode_text])[0]
        labels = self.decode(nodes=nodes[:, :-1])
        tags = [id2tag.get(i) for i in labels]
        return tags

    def cut(self, text):
        tags = self.parse_tag(text)
        assert len(tags) == len(text)
        ans = ''
        for word, tag in zip(text, tags):
            ans += word
            if tag in ['S', 'E']:
                yield ans
                ans = ''


In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(tokenizer.char_data, tokenizer.label, test_size=0.02)
print(len(x_train), len(x_test))

85185 1739


In [14]:
model.compile(loss=CRF.dense_loss, metrics=[CRF.dense_accuracy], optimizer='Adam')

## 测试一条数据
由于模型还未开始训练，因此输出的只是测试结果。
创建维特比解码，模型没有训练，转移矩阵是一个随机数，解码肯定存在问题。

In [28]:
ws = CWS(model.get_weights()[-1][:-1, :-1], starts=[0,3], ends=[2,3])
"/".join(ws.cut('不知道这个玩意到底怎么样?'))

'不/知道/这个/玩意/到底/怎么/样?'

## 创建模型回调
模型的测试数据在每个epoch之后做一次评估。评估结果使用维特比解码。

In [57]:
from tensorflow.keras.callbacks import Callback
import numpy as np

class Evaluator(Callback):
    def on_epoch_end(self, epoch, logs=None):
        right = 0
        total = 0
        for sentence, tags in tqdm(zip(x_test, y_test)):
            if len(sentence) == 0:
                continue
            y_pred = model.predict([sentence])[0] # [src, num_classes]
            y_pred = ws.decode(y_pred[:, :-1])
            
            right += (y_pred == tags).sum()
            total += len(tags)
        print()
        print("accuracy: ", right / total)
        self.just_show()

    @staticmethod
    def just_show():
        print("/".join(ws.cut('我是中国人，我热爱中国！')))
        print("/".join(ws.cut('陕西师范大学位于陕西省西安市')))
        print("/".join(ws.cut('最近凑着热闹玩了玩全球人工智能技术创新大赛')))

In [59]:
model.fit(DataLoader(x_train, y_train), 
          epochs=5, 
          callbacks=[Evaluator()])

Epoch 1/5

1739it [00:48, 35.60it/s]


accuracy:  0.9501109193985704
我/是/中国人/，/我/热爱/中国/！
陕西师范大学/位于/陕西省/西安市
最近/凑/着/热闹/玩/了/玩/全球/人工/智能/技术/创新/大赛





Epoch 2/5

1739it [00:48, 35.66it/s]



accuracy:  0.9506655163914223
我/是/中国人/，/我/热爱/中国/！
陕西师范大学/位于/陕西省/西安市
最近/凑/着/热闹/玩/了/玩/全球/人工/智能/技术/创新/大赛
Epoch 3/5

1739it [00:48, 35.86it/s]


accuracy:  0.9531180675375893
我/是/中国人/，/我/热爱/中国/！
陕西师范大学/位于/陕西省/西安市
最近/凑/着/热闹/玩/了/玩/全球/人工/智能/技术/创新/大赛





Epoch 4/5

1739it [00:48, 35.68it/s]



accuracy:  0.9540916933694849
我/是/中国人/，/我/热爱/中国/！
陕西师范大学/位于/陕西省/西安市
最近/凑/着/热闹/玩/了/玩/全球/人工/智能/技术/创新/大赛
Epoch 5/5

1739it [00:49, 35.28it/s]


accuracy:  0.9550776435789993
我/是/中国人/，/我/热爱/中国/！
陕西师范大学/位于/陕西省/西安市
最近/凑/着/热闹/玩/了/玩/全球/人工/智能/技术/创新/大赛





<tensorflow.python.keras.callbacks.History at 0x7f9f3933a350>

In [62]:
model.save('/content/drive/MyDrive/data/icwb2-data/cnn-cws')

INFO:tensorflow:Assets written to: /content/drive/MyDrive/data/icwb2-data/cnn-cws/assets
