# 对联模型
对联数据集采用：https://github.com/wb14123/couplet-dataset

模型以预训练的 ALBERT 为基础（tiny 和 base 两个版本都试验过，当前代码加载的是 base 版本），并采用 UniLM 的方式实现 seq2seq 模型

代码参考苏剑林的博客，并使用苏剑林开源的bert4keras框架

## 加载训练数据和测试数据

In [1]:
# Locations of the couplet dataset files. Each split has an "in" file and an
# "out" file; the two are consumed pairwise further down.
train_file_path_in, train_file_path_out = (
    'couplet/train/in.txt',
    'couplet/train/out.txt',
)

test_file_path_in, test_file_path_out = (
    'couplet/test/in.txt',
    'couplet/test/out.txt',
)

def load_data(filename):
    """Read a text file and return its lines as a list of strings.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default locale encoding.

    Lines are split on ``'\\n'`` only and are otherwise returned untouched
    (trailing spaces are kept). Empty lines in the middle of the file are
    preserved so that line ``i`` of one file still lines up with line ``i``
    of a parallel file. At most one trailing empty string, which is what a
    final newline produces, is dropped so that it does not become a phantom
    empty record.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: One entry per line; ``[]`` for an empty file.
    """
    with open(filename, encoding='utf-8') as fd:
        lines = fd.read().split('\n')
    # A file ending in '\n' yields a final '' after the split; it is not data.
    if lines[-1] == '':
        lines.pop()
    return lines

In [2]:
# Load every split into memory; the "in" and "out" lists of a split are
# parallel (same index = same couplet).
train_in, train_out = (
    load_data(path) for path in (train_file_path_in, train_file_path_out)
)
test_in, test_out = (
    load_data(path) for path in (test_file_path_in, test_file_path_out)
)

In [3]:
# Show the first ten entries of train_in as a quick sanity check of the load.
train_in[:10]

['晚 风 摇 树 树 还 挺 ',
 '愿 景 天 成 无 墨 迹 ',
 '丹 枫 江 冷 人 初 去 ',
 '忽 忽 几 晨 昏 ， 离 别 间 之 ， 疾 病 间 之 ， 不 及 终 年 同 静 好 ',
 '闲 来 野 钓 人 稀 处 ',
 '毋 人 负 我 ， 毋 我 负 人 ， 柳 下 虽 和 有 介 称 ， 先 生 字 此 ， 可 以 谥 此 ',
 '投 石 向 天 跟 命 斗 ',
 '深 院 落 滕 花 ， 石 不 点 头 龙 不 语 ',
 '不 畏 鸿 门 传 汉 祚 ',
 '新 居 落 成 创 业 始 ']

## 加载Albert的Tokenizer和model

In [4]:
import os

# Directory of the pretrained ALBERT checkpoint that is loaded (base variant).
# Alternative kept for reference (tiny variant):
#   'bert_models/albert_tiny_google_zh_489k/'
bert_path = 'bert_models/albert_base_google_zh_additional_36k_steps'

# Model config, checkpoint prefix and vocabulary all sit in that directory.
config_path, checkpoint_path, dict_path = (
    os.path.join(bert_path, file_name)
    for file_name in ('albert_config.json', 'albert_model.ckpt', 'vocab.txt')
)

In [5]:
# 设置后端的tf.keras
# os.environ['TF_KERAS'] = '1'

from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.models import build_transformer_model

Using TensorFlow backend.


In [6]:
# TODO: the couplet task only needs the Chinese entries of the vocabulary;
# trimming the vocabulary down to them is left as a later optimization.
# Load the checkpoint's vocabulary file and build the tokenizer from it.
token_dict = load_vocab(dict_path=dict_path)
tokenizer = Tokenizer(token_dict=token_dict)

In [7]:
# Encode the first training pair as a single two-part sequence. The cell
# output is (token ids, segment ids); in the segment ids the first text is
# marked 0 and the second text is marked 1.
tokenizer.encode(train_in[0], train_out[0], maxlen=50)

([101,
  3241,
  7599,
  3031,
  3409,
  3409,
  6820,
  2923,
  102,
  3247,
  7463,
  3883,
  5709,
  5709,
  3291,
  5273,
  102],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [8]:
# 数据进行编码
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder, sequence_padding

In [9]:
class CoupletDataGenerator(DataGenerator):
    """Yield padded training batches built from (first line, second line) pairs.

    Every sample is encoded with the module-level ``tokenizer`` as one
    two-text sequence (``maxlen=50``). The yielded value is
    ``([token_ids, segment_ids], None)``: there is no separate label because
    the loss is computed inside the model from the inputs themselves.
    """

    def __iter__(self, random=False):
        token_rows, segment_rows = [], []
        for is_end, sample in self.sample(random):
            first_line, second_line = sample[0], sample[1]
            tokens, segments = tokenizer.encode(
                first_line, second_line, maxlen=50
            )
            token_rows.append(tokens)
            segment_rows.append(segments)

            # Keep accumulating until the batch is full or the data ran out.
            batch_ready = is_end or len(token_rows) == self.batch_size
            if not batch_ready:
                continue

            padded_tokens = sequence_padding(token_rows)
            padded_segments = sequence_padding(segment_rows)
            yield [padded_tokens, padded_segments], None
            token_rows, segment_rows = [], []

In [10]:
# loss层
from bert4keras.layers import Loss
from bert4keras.backend import K

# class CrossEntropy(Loss):
#     def compute_loss(self, inputs, mask=None):
#         y_true, y_mask, y_pred = inputs
#         y_true = y_true[:, 1:]
#         y_mask = y_mask[:, 1:]
#         y_pred = y_true[:, :-1]
        
#         loss = K.sparse_categorical_crossentropy(y_true, y_pred)
#         loss = K.sum(loss * y_mask) / K.sum(y_mask)
#         return loss
        
class CrossEntropy(Loss):
    """Cross-entropy loss that ignores the input (first-text) positions.

    ``inputs`` is ``[token_ids, segment_ids, predictions]``. The segment ids
    double as the loss mask: they are 1 exactly on the positions that belong
    to the text to be generated.
    """

    def compute_loss(self, inputs, mask=None):
        token_ids, segment_ids, predictions = inputs
        # The prediction at position t is scored against the token at t + 1,
        # so targets and mask drop the first step and predictions the last.
        targets = token_ids[:, 1:]
        target_mask = segment_ids[:, 1:]
        shifted_predictions = predictions[:, :-1]
        per_token_loss = K.sparse_categorical_crossentropy(
            targets, shifted_predictions
        )
        # Average over the masked-in positions only.
        return K.sum(per_token_loss * target_mask) / K.sum(target_mask)

In [11]:
# 加载Albert模型
from keras.models import Model
from tensorflow.keras.optimizers import Adam

# Build ALBERT from the pretrained checkpoint in its UniLM (seq2seq) form.
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='albert',
    application='unilm',
)
# Feed [token_ids, segment_ids, predictions] to the loss layer, matching the
# unpacking in CrossEntropy.compute_loss. The argument 2 presumably selects
# which of those inputs the layer passes through as its output -- confirm
# against the bert4keras Loss layer.
output = CrossEntropy(2)(model.inputs + model.outputs)

In [12]:
# Re-wrap the network so that its output is the loss layer's output.
model = Model(model.inputs, output)
# No loss is passed to compile(); presumably the CrossEntropy layer registers
# the loss itself -- confirm against the bert4keras Loss layer.
model.compile(optimizer=Adam(1e-5))
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             16226304    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]        



In [13]:
# 自动对下联
import numpy as np


class AutoNextCouplet(AutoRegressiveDecoder):
    """Decoder that generates the second line of a couplet from the first.

    Relies on the module-level ``model`` and ``tokenizer``.
    """

    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, stats):
        """Return the model's prediction for the next token.

        ``inputs`` holds the encoded first line (token ids, segment ids) and
        ``output_ids`` the tokens generated so far. The generated tokens are
        appended to the prefix and marked with segment id 1.
        """
        prefix_tokens, prefix_segments = inputs
        generated_segments = np.ones_like(output_ids)
        full_tokens = np.concatenate([prefix_tokens, output_ids], 1)
        full_segments = np.concatenate([prefix_segments, generated_segments], 1)
        return self.last_token(model).predict([full_tokens, full_segments])

    def next_couplet(self, text, topk=1):
        """Generate the matching line for ``text`` with beam search.

        Args:
            text: The first line of the couplet.
            topk: Value forwarded to ``beam_search`` as ``topk``.

        Returns:
            The decoded text of the generated line.
        """
        # The first line is encoded with a cap of 50 tokens.
        token_ids, segment_ids = tokenizer.encode(text, maxlen=50)
        best_ids = self.beam_search([token_ids, segment_ids], topk=topk)
        return tokenizer.decode(best_ids)
        

In [17]:
from keras.callbacks import Callback

# Shared decoder instance used by the evaluation callback and the demo cells.
# Generation stops at the tokenizer's end token; maxlen=50 is presumably the
# cap on generated tokens -- confirm against AutoRegressiveDecoder.
next_couplet = AutoNextCouplet(start_id=None, end_id=tokenizer._token_end_id, maxlen=50)

class EvalCallback(Callback):
    """Save the best weights seen so far and print sample generations.

    After every epoch the training loss is compared with the lowest loss
    recorded so far. On improvement the record is updated and the weights of
    the module-level ``model`` are saved. A few fixed first lines are then
    completed with the module-level ``next_couplet`` decoder, so that
    progress can be judged by eye.
    """

    def __init__(self):
        super(EvalCallback, self).__init__()
        # Lowest training loss observed so far; starts above any real loss.
        self.lowest = 1e8

    def on_epoch_end(self, epoch, logs=None):
        """Checkpoint on improvement, then show sample outputs.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict supplied by Keras; only ``'loss'`` is read and
                the dict is never modified.
        """
        loss = (logs or {}).get('loss')
        if loss is not None and loss < self.lowest:
            # Record the new best value. Writing it back into ``logs``
            # instead would corrupt the reported loss and make every epoch
            # look like an improvement.
            self.lowest = loss
            model.save_weights('weights/couplet-albert-base-best-weights.weights')

        self.just_show()

    def just_show(self):
        """Print each sample first line followed by the generated second line."""
        first = ['今日天气多云多美丽', '珍藏惟有诗三卷', '狂笔一挥天地动', '推窗问月诗何在']
        for each in first:
            print(" -", each)
            print("--", next_couplet.next_couplet(each))
            print()
            

In [18]:
# Generate a matching line for one sample first line.
next_couplet.next_couplet('今日天气多云多美丽')

'今朝风光万象大风和'

In [23]:
# Materialize the pairs into a list. A bare zip() is a one-shot iterator: it
# is exhausted after the first pass over the data, while forfit() iterates
# the data again and again. A list can be re-iterated, and it has a length.
train_data = CoupletDataGenerator(list(zip(train_in, train_out)))
# test_data = CoupletDataGenerator(list(zip(test_in, test_out)))

# Train; EvalCallback saves the best weights and prints samples every epoch.
model.fit(train_data.forfit(), epochs=5, steps_per_epoch=10000, callbacks=[EvalCallback()])

In [19]:
# model.load_weights('couplet-best-weights.weights')
# EvalCallback().just_show()

# albert tiny
# - 今日天气多云多美丽
# -- 今朝人人有意有情情

#  - 珍藏惟有诗三卷
# -- 喜见常知画一书

#  - 狂笔一挥天地动
# -- 新风再绘古今新

#  - 推窗问月诗何在
# -- 对月吟诗画不同

# # albert base
# - 今日天气多云多美丽
# -- 今朝风光万象大风和

#  - 珍藏惟有诗三卷
# -- 珍藏不无酒一壶

#  - 狂笔一挥天地动
# -- 高歌万载日月长

#  - 推窗问月诗何在
# -- 对月吟诗酒自酣


 - 今日天气多云多美丽
-- 今朝风光万象大风和

 - 珍藏惟有诗三卷
-- 珍藏不无酒一壶

 - 狂笔一挥天地动
-- 高歌万载日月长

 - 推窗问月诗何在
-- 对月吟诗酒自酣



In [22]:
# Same demo with the input written like the dataset lines (characters
# separated by spaces, trailing space included).
next_couplet.next_couplet('清 风 凝 白 雪 ')

'明月照青山'