In [1]:
# 初始化
%load_ext autoreload
%autoreload 2
import sys
import os
# Project root is a Windows path: use a raw string so backslashes are never
# read as escape sequences ('\G' and '\Q' are invalid escapes and trigger a
# warning on recent Python versions). The resulting string value is unchanged.
PROJECT_ROOT = r'E:\GitHub\QA-abstract-and-reasoning'
os.chdir(PROJECT_ROOT)
# Re-running this cell must not keep appending the same entry to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import tensorflow as tf
from pgn.layers import BahdanauAttention, Encoder, Pointer
from pgn.batcher import batcher
from utils.saveLoader import load_embedding_matrix
from utils.saveLoader import Vocab
from utils.config import VOCAB_PAD
from utils.config_gpu import config_gpu
# Project helper from utils.config_gpu; in the saved run it reported
# "1 Physical GPUs, 1 Logical GPUs".
# NOTE(review): presumably configures GPU visibility/memory — confirm in utils/config_gpu.
config_gpu()

1 Physical GPUs, 1 Logical GPUs


## 构建输入

In [3]:
%run utils/params.py
# Produce input data: build the vocabulary, wrap the batcher in an iterator
# and take its first element as the (encoder, decoder) pair used for debugging.
vocab = Vocab(VOCAB_PAD)
ds = iter(batcher(vocab, params))
enc_data, dec_data = next(ds)

## 以下是调试好的Decoder单元
![](https://img-blog.csdn.net/20180809142518309?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3Rob3JtYXMxOTk2/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70)

In [5]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with the attention module built in.

    Each call embeds one target token per example, advances the GRU cell by
    one step, runs attention over the encoder outputs using the new hidden
    state, and projects [GRU output; context vector] to vocabulary scores.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_matrix,
                 dec_units, batch_size, attention):
        """Create the decoder's layers.

        Args:
            vocab_size: size of the embedding table and width of the final
                projection.
            embedding_dim: size of each word vector.
            embedding_matrix: initial weights of the embedding layer, which is
                created with ``trainable=False``.
            dec_units: number of units in the GRU cell.
            batch_size: stored as ``self.batch_sz``; not otherwise used here.
            attention: callable invoked as ``attention(dec_hidden, enc_output,
                enc_pad_mask, use_coverage, prev_coverage)`` and expected to
                return ``(context_vector, attn, coverage)``.
        """
        super().__init__()
        self.batch_sz = batch_size
        self.dec_units = dec_units

        # Embedding table initialised from embedding_matrix and kept frozen.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            weights=[embedding_matrix],
            trainable=False,
        )

        self.cell = tf.keras.layers.GRUCell(
            units=self.dec_units,
            recurrent_initializer='glorot_uniform',
        )

        self.attention = attention
        # The original author was unsure which width to use for this layer;
        # dec_units * 2 equals the width of the concatenated input only when
        # enc_units == dec_units (as in this notebook).
        self.fc1 = tf.keras.layers.Dense(self.dec_units * 2)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

    def call(self, dec_input,  # (batch_size, )
             prev_dec_hidden,  # (batch_size, dec_units)
             enc_output,  # (batch_size, enc_len, enc_units)
             enc_pad_mask,  # (batch_size, enc_len)
             use_coverage=True,
             prev_coverage=None):
        """Run one decoding step.

        Shapes below are the ones annotated by the notebook author.

        Args:
            dec_input: token ids for the current step, (batch_size,).
            prev_dec_hidden: previous GRU state, (batch_size, dec_units).
            enc_output: encoder outputs, (batch_size, enc_len, enc_units).
            enc_pad_mask: mask over encoder positions, (batch_size, enc_len).
            use_coverage: forwarded unchanged to the attention module.
            prev_coverage: forwarded unchanged to the attention module.

        Returns:
            A 6-tuple, in this order:
              0. context_vector (batch_size, enc_units)
              1. dec_hidden (batch_size, dec_units), the new GRU state
              2. dec_x (batch_size, embedding_dim), the embedded input
              3. pred (batch_size, vocab_size), output of the last Dense layer
              4. attn (batch_size, enc_len)
              5. coverage (batch_size, enc_len, 1)
        """
        # Word vectors for the current input tokens.
        token_embedding = self.embedding(dec_input)

        # A Keras cell takes and returns its state as a list ([h] for a GRU,
        # [h, c] for an LSTM); unpacking with [new_hidden] yields the tensor.
        cell_output, [new_hidden] = self.cell(token_embedding, [prev_dec_hidden])

        # Attention over the encoder outputs, driven by the new hidden state.
        context_vector, attn, coverage = self.attention(
            new_hidden,
            enc_output,
            enc_pad_mask,
            use_coverage,
            prev_coverage,
        )

        # Join the GRU output with the context vector, then project to the
        # vocabulary through the two Dense layers.
        features = tf.concat([cell_output, context_vector], axis=-1)
        pred = self.fc2(self.fc1(features))

        return context_vector, new_hidden, token_embedding, pred, attn, coverage

$$
P_{gen} = \sigma(w_{h^*}^Th_t^*+w_s^Ts_t+w_x^Tx_t+b_{ptr})
$$

## 调试PGN模型
### PGN初始化参数

In [6]:
# Load the embedding matrix through the project helper (utils.saveLoader);
# it is passed to both the Encoder and the Decoder constructors below.
embedding_matrix = load_embedding_matrix()

In [7]:
# Hyper-parameters used to build the layers below.
vocab_size = params["vocab_size"]
embedding_dim = 300
# Encoder, decoder and attention all share one width.
enc_units = 256
dec_units = enc_units
attn_units = enc_units
batch_size = 64
enc_len = 200

In [8]:
# Build the four components of the pointer-generator network.
encoder = Encoder(vocab_size,
                  embedding_dim,
                  embedding_matrix,
                  enc_units,
                  batch_size)
attention = BahdanauAttention(attn_units)
# The Decoder defined in this notebook owns its attention module, so the
# attention instance is a required constructor argument (omitting it raises
# TypeError). Its width argument is dec_units, not enc_units.
decoder = Decoder(vocab_size,
                  embedding_dim,
                  embedding_matrix,
                  dec_units,
                  batch_size,
                  attention)
pointer = Pointer()

### PGN.call的参数

In [9]:
# Unpack one batch into the arguments PGN.call expects.
# Author's note: (64, 200) — presumably the shape of enc_inp, i.e.
# (batch_size, enc_len); TODO confirm.
enc_inp = enc_data["enc_input"]
enc_extended_inp = enc_data["extended_enc_input"]
enc_pad_mask = enc_data["enc_mask"]
batch_oov_len = enc_data["max_oov_len"]
dec_inp = dec_data["dec_input"]

# Coverage is enabled; there is no accumulated coverage before the first step.
use_coverage = True
prev_coverage = None

### 改为使用tf.TensorArray

是为了能使用`@tf.function`加速训练

[参考1](https://tensorflow.google.cn/guide/function#batching)

[参考2](https://tensorflow.google.cn/tutorials/customization/performance#gotchas)

In [10]:
# Collect the per-step results in tf.TensorArray objects instead of Python
# lists. Each of the four arrays has one slot per decoder time step
# (dec_inp.shape[1]).
predictions, attentions, p_gens, coverages = (
    tf.TensorArray(tf.float32, size=dec_inp.shape[1]) for _ in range(4)
)

### 修改部分：
decoder内置了attention

模拟循环

In [12]:
# Encode once; the encoder's final hidden state seeds the decoder.
enc_output, enc_hidden = encoder(enc_inp)
dec_hidden = enc_hidden
prev_coverage = None
for t in tf.range(dec_inp.shape[1]):
    # One decoder step on the t-th target token. The coverage returned here is
    # fed back in as prev_coverage on the next iteration.
    context_vector, dec_hidden, \
    dec_x, pred, attn, prev_coverage = decoder(dec_inp[:, t],  # (batch_size, )
                                        dec_hidden,  # (batch_size, dec_units)
                                        enc_output,  # (batch_size, enc_len, enc_units)
                                        enc_pad_mask, # (batch_size, enc_len)
                                        use_coverage,
                                        prev_coverage)
    p_gen = pointer(context_vector, dec_hidden, dec_x)

    # TensorArray.write returns the updated array and that result must be
    # used: discarding it only happens to work in eager mode and breaks once
    # this loop is wrapped in @tf.function.
    predictions = predictions.write(t, pred)
    attentions = attentions.write(t, attn)
    p_gens = p_gens.write(t, p_gen)
    coverages = coverages.write(t, prev_coverage)

In [13]:
# Shape of a single time-step entry in each TensorArray.
(
    predictions.element_shape,
    attentions.element_shape,
    p_gens.element_shape,
    coverages.element_shape,
)

(TensorShape([64, 32233]),
 TensorShape([64, 200]),
 TensorShape([64, 1]),
 TensorShape([64, 200, 1]))

### for循环之后计算final_dists

$$
P(w) = p_{gen}P_{vocab}(w)+(1-p_{gen})\sum_{i:w_i=w}a_i^t
$$

In [27]:
from pgn.model import _calc_final_dist

In [22]:
# Each TensorArray stacks time-major (dec_len first); swap the first two axes
# so predictions, attentions and p_gens are batch-major before they are mixed
# into the final distribution.
fd = _calc_final_dist(
    enc_extended_inp,
    *[tf.transpose(arr.stack(), perm=[1, 0, 2])
      for arr in (predictions, attentions, p_gens)],
    batch_oov_len,
    vocab_size,
    batch_size,
)
fd.shape

TensorShape([64, 39, 32242])

## 调试完毕,调包测试代码
修改pgn.model

In [3]:
# from pgn.layers import Encoder, Decoder, BahdanauAttention, Pointer
import tensorflow as tf
from pgn.batcher import batcher
from pgn.model import PGN
from utils.saveLoader import Vocab
from utils.config import VOCAB_PAD
from utils.config_gpu import config_gpu
from pgn.model import _calc_final_dist
# Project helper from utils.config_gpu; in the saved run it reported
# "1 Physical GPUs, 1 Logical GPUs".
# NOTE(review): presumably configures GPU visibility/memory — confirm in utils/config_gpu.
config_gpu()

1 Physical GPUs, 1 Logical GPUs


In [4]:
%run utils/params.py
# Produce input data: build the vocabulary, wrap the batcher in an iterator
# and take its first element as the (encoder, decoder) pair used for testing.
vocab = Vocab(VOCAB_PAD)
ds = iter(batcher(vocab, params))
enc_data, dec_data = next(ds)

In [5]:
# Unpack the batch into the tensors that model(...) consumes below.
enc_inp = enc_data["enc_input"]
enc_extended_inp = enc_data["extended_enc_input"]
enc_pad_mask = enc_data["enc_mask"]
batch_oov_len = enc_data["max_oov_len"]
dec_inp = dec_data["dec_input"]

# Packaged pointer-generator model, configured from the shared params dict.
model = PGN(params)

### 调用model.call()

In [7]:
# Full forward pass; the model returns the final distribution together with
# the attention and coverage tensors.
final_dist, attentions, coverages = model(
    enc_inp,
    dec_inp,
    enc_extended_inp,
    batch_oov_len,
    enc_pad_mask,
)

In [8]:
# Shapes of the three tensors returned by the model.
(
    final_dist.shape,
    attentions.shape,
    coverages.shape,
)

(TensorShape([64, 39, 32237]),
 TensorShape([64, 39, 200]),
 TensorShape([64, 39, 200]))

#### 测试损失函数

In [82]:
from pgn.loss import loss_function, loss_function2, _coverage_loss, calc_loss

In [50]:
# Loss inputs taken from the decoder half of the batch: the target sequence
# and the decoder mask (used as the padding mask by the loss functions).
target, padding_mask = dec_data["dec_target"], dec_data["dec_mask"]

log loss

In [None]:
# Log loss of the final distribution against the targets (see the "log loss" label above).
# NOTE(review): this cell has no execution count and no saved output, so it
# was not run in the saved notebook.
loss_function(target, final_dist, padding_mask)

cov_loss

$t$是decoder的时间步 (0, dec_len)

$i$是encoder的时间步 (0, enc_len)

$$
\text{covloss}_t = \sum_i \min(a_i^t, c_i^t)
$$

In [83]:
# Coverage loss computed from the attention and coverage tensors returned by
# model(...), using the decoder padding mask; the saved output is a scalar.
_coverage_loss(attentions, coverages, padding_mask)

<tf.Tensor: id=8017, shape=(), dtype=float32, numpy=0.6298077>

In [85]:
# Combined loss. The three scalars in the saved output are consistent with
# (total, log_loss, cov_loss) where total = log_loss + cov_loss_wt * cov_loss
# (6.534697 + 0.5 * 0.6298077 ~= 6.849601).
# NOTE(review): confirm the return order in pgn.loss.
calc_loss(target, final_dist, padding_mask, attentions, coverages, cov_loss_wt=0.5)

(<tf.Tensor: id=8062, shape=(), dtype=float32, numpy=6.849601>,
 <tf.Tensor: id=8051, shape=(), dtype=float32, numpy=6.534697>,
 <tf.Tensor: id=8059, shape=(), dtype=float32, numpy=0.6298077>)

### 调用Encoder

In [8]:
# Run the model's encoder on the encoder input; its returned hidden state is
# used as the decoder's initial hidden state.
enc_output, enc_hidden = model.encoder(enc_inp)
dec_hidden = enc_hidden

### 调用Decoder

In [10]:
# enc_output and the initial decoder hidden state come from the encoder;
# enc_pad_mask was unpacked from the batch earlier.
# First decoder input: one START token id per example, as a rank-1 tensor.
# The batch size is read from params instead of being hard-coded as 64, so
# this cell keeps working when the configured batch size changes.
dec_input = tf.constant([vocab.word2id[vocab.START_DECODING]] * params["batch_size"])

In [11]:
# One decoder step from the START token, with the decoder's default coverage
# arguments.
context_vector, dec_hidden, dec_x, pred, attn, coverage = model.decoder(
    dec_input,
    dec_hidden,
    enc_output,
    enc_pad_mask,
)

In [12]:
# Read both sizes from the shared params dict and display them.
vocab_size, batch_size = params["vocab_size"], params["batch_size"]
(vocab_size, batch_size)

(32233, 64)

### 计算单步decoder的final_dist

In [13]:
# Pointer output for this step, computed from the context vector, the decoder
# hidden state and the embedded decoder input (the three inputs of the p_gen
# formula given earlier in this notebook).
p_gen = model.pointer(context_vector, dec_hidden, dec_x)

In [15]:
# _calc_final_dist needs pred, attn and p_gen to be 3-D, so give each
# single-step tensor a length-1 axis at position 1 first.
final_dist = _calc_final_dist(
    enc_extended_inp,
    *[tf.expand_dims(step_tensor, 1) for step_tensor in (pred, attn, p_gen)],
    batch_oov_len,
    vocab_size,
    batch_size,
)
final_dist.shape

TensorShape([64, 1, 32237])

### 单步decoder封装成函数

In [16]:
# First decoder input: one START token id per example. The batch size is read
# from params instead of being hard-coded as 64.
dec_input = tf.constant([vocab.word2id[vocab.START_DECODING]] * params["batch_size"])

# Compute the encoder outputs once, up front; the encoder's hidden state
# seeds the decoder.
enc_output, enc_hidden = model.encoder(enc_inp)
dec_hidden = enc_hidden


def decode_one_step(dec_input, dec_hidden, enc_output,
                   enc_pad_mask, prev_coverage, use_coverage=True):
    """Run a single decoder step and return the final word distribution.

    Besides its arguments, this function reads the notebook globals ``model``,
    ``enc_extended_inp``, ``batch_oov_len`` and ``params``.

    Args:
        dec_input: decoder input token ids for this step.
        dec_hidden: previous decoder hidden state.
        enc_output: encoder outputs.
        enc_pad_mask: mask over encoder positions.
        prev_coverage: coverage from the previous step (None on the first step).
        use_coverage: whether the attention module should use coverage.

    Returns:
        (final_dist, dec_hidden, coverage): the final distribution with a
        length-1 step axis at position 1, the new decoder hidden state, and
        the coverage returned by the decoder.
    """
    # Pass the coverage arguments by keyword. The Decoder.call signature shown
    # in this notebook orders them (use_coverage, prev_coverage), the opposite
    # of this function's parameters, so passing them positionally swaps them.
    # NOTE(review): assumes model.decoder uses these same parameter names.
    context_vector, dec_hidden, \
    dec_x, pred, attn, coverage = model.decoder(dec_input, dec_hidden, enc_output,
                                                enc_pad_mask,
                                                use_coverage=use_coverage,
                                                prev_coverage=prev_coverage)

    # Compute p_gen for this step.
    p_gen = model.pointer(context_vector, dec_hidden, dec_x)

    # _calc_final_dist needs pred, attn and p_gen to be 3-D, so add a
    # length-1 axis at position 1 to each.
    final_dist = _calc_final_dist(enc_extended_inp,
                                  tf.expand_dims(pred, 1),
                                  tf.expand_dims(attn, 1),
                                  tf.expand_dims(p_gen, 1),
                                  batch_oov_len,
                                  params["vocab_size"],
                                  params["batch_size"])

    return final_dist, dec_hidden, coverage

In [17]:
# First step: there is no previous coverage yet.
final_dist, dec_hidden, coverage = decode_one_step(
    dec_input,
    dec_hidden,
    enc_output,
    enc_pad_mask,
    prev_coverage=None,
    use_coverage=True,
)