In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
import sys

sys.path.append('../')  # 返回notebook的上一级目录
# sys.path.append('E:\GitHub\QA-abstract-and-reasoning')  # 效果同上

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
np.set_printoptions(suppress=True)
from utils.loader import *
from utils.config import *

from gensim.models.word2vec import LineSentence, Word2Vec
import tensorflow as tf
# from model_layer import seq2seq_model
import time

# 加载数据

In [3]:
# Load the arrays and lookup tables used by the rest of the notebook.
# The helpers and path constants come from the star imports of utils.loader / utils.config.
train_x,train_y,test_x = load_dataset()  # dataset
# presumably token->index and index->token mappings (names suggest so) — TODO confirm
vocab_index,index_vocab = load_vocab(VOCAB_INDEX_PAD)  # vocab
embedding_matrix = np.loadtxt(EMBEDDING_MATRIX_PAD)  # pre-trained embedding weights

In [4]:
# Input length: size of the second axis of train_x
# (the recorded output of this cell reports 260).
input_length = train_x.shape[1]
# Output length: size of the second axis of train_y
# (the recorded output of this cell reports 33).
output_sequence_length = train_y.shape[1]
# Vocabulary size
vocab_size=len(vocab_index)
print("输入长度：{}\n输出长度：{}\n词表大小：{}".format(input_length, output_sequence_length, vocab_size))

输入长度：260
输出长度：33
词表大小：32901


## 1 基本参数设置

In [5]:
# Train on a small subset of the data: the first sample_num rows only.
sample_num=640
train_X=train_x[:sample_num]
train_Y=train_y[:sample_num]

In [6]:
# Number of training examples; also used as the shuffle buffer size below
BUFFER_SIZE = len(train_X)

# Input sequence length
max_length_inp=train_X.shape[1]
# Output sequence length
max_length_targ=train_Y.shape[1]

BATCH_SIZE = 64

# Number of iterations needed for one training epoch
steps_per_epoch = len(train_X)//BATCH_SIZE

# Word-embedding dimension
embedding_dim = 300

# Number of hidden units
units = 1024

# Vocabulary size
vocab_size = len(vocab_index)

# Build the training dataset: shuffle over the whole subset, then cut into
# fixed-size batches (drop_remainder=True discards a trailing partial batch).
dataset = tf.data.Dataset.from_tensor_slices((train_X, train_Y)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

## 2 构建Encoder

In [7]:
class Encoder(tf.keras.Model):
    """GRU encoder on top of a frozen, pre-initialised embedding table.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        embedding_matrix: initial embedding weights, passed to the Embedding
            layer via ``weights=[...]``; the layer is created non-trainable.
        enc_units: number of GRU units (size of the hidden state).
        batch_sz: batch size used when building the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim ,embedding_matrix , enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units  # width of the GRU hidden state
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            weights=[embedding_matrix],
            trainable=False,
        )
        # The GRU hands back both the per-timestep outputs and the final state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed the token ids in ``x`` and run the GRU starting from ``hidden``.

        Returns:
            A ``(sequence_output, final_state)`` pair. In the recorded run the
            shapes are (64, 260, 1024) and (64, 1024) respectively.
        """
        embedded = self.embedding(x)
        print("after embedding", embedded.shape)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [8]:
vocab_size, embedding_dim, units, BATCH_SIZE

(32901, 300, 1024, 64)

In [9]:
embedding_matrix.shape

(32901, 300)

In [118]:
encoder = Encoder(vocab_size, embedding_dim,embedding_matrix, units, BATCH_SIZE)

In [119]:
x = tf.cast(train_x[:64], dtype=tf.int32)
print(x.shape)
x

(64, 260)


<tf.Tensor: id=9734, shape=(64, 260), dtype=int32, numpy=
array([[32897,   403,   986, ..., 32900, 32900, 32900],
       [32897,   790, 32898, ..., 32900, 32900, 32900],
       [32897,  1453,    82, ...,    31,     2, 32899],
       ...,
       [32897,  4395, 29309, ..., 32900, 32900, 32900],
       [32897,   167,     7, ..., 32900, 32900, 32900],
       [32897,  2142,   954, ..., 32900, 32900, 32900]])>

In [120]:
sample_hidden = encoder.initialize_hidden_state()
sample_hidden

<tf.Tensor: id=9737, shape=(64, 1024), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [121]:
# Run the encoder once on the sample batch and print the resulting shapes.
# Note that sample_hidden is rebound here to the state returned by the encoder.
# (Author's note: the shape of this output looks a lot like that of a weight tensor.)
sample_output, sample_hidden = encoder(x, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

after embedding (64, 260, 300)
Encoder output shape: (batch size, sequence length, units) (64, 260, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


<img src="picture/gru.png" width = "50%" height = "50%" />

In [14]:
len(encoder.weights)

4

## 3 构建Attention

In [15]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query))).

    Args:
        units: output width of the two projection layers ``W1`` and ``W2``.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and attention weights.

        Args:
            query: the previous GRU hidden state (the decoder state s_t in a
                seq2seq model).
            values: the encoder outputs enc_output (the encoder hidden
                states h_i).

        Returns:
            ``(context_vector, attention_weights)``. In the recorded run the
            shapes are (64, 1024) and (64, 260, 1).
        """
        # Insert an axis at position 1 so the query broadcasts against the
        # per-timestep projection of `values`.
        query_with_time_axis = tf.expand_dims(query, 1)

        # Unnormalised attention score for every position of `values`.
        projected = self.W1(values) + self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected))

        # Normalise over axis 1: (batch_size, max_length, 1).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of `values` over axis 1: (batch_size, hidden_size).
        # This is later fed to the decoder as part of its input.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [16]:
# Smoke-test the attention layer with 10 units in its two projection layers.
# sample_hidden is passed as the query and sample_output as the values.
attention_layer = BahdanauAttention(10)
context_vector, attention_weights = attention_layer(sample_hidden, sample_output)

print("context_vector shape: (batch size, units) {}".format(context_vector.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

context_vector shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 260, 1)


In [17]:
w1 = tf.keras.layers.Dense(10)
w2 = tf.keras.layers.Dense(10)
v = tf.keras.layers.Dense(1)

In [18]:
sample_output.shape

TensorShape([64, 260, 1024])

一个样本是260行1024列的

In [19]:
w1(sample_output).shape

TensorShape([64, 260, 10])

输出是260行10列的

In [20]:
w1.weights[0].shape

TensorShape([1024, 10])

## 4 构建Decoder

In [21]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with Bahdanau attention over the encoder outputs.

    Args:
        vocab_size: number of rows of the embedding table and width of the
            final Dense layer.
        embedding_dim: width of each embedding vector.
        embedding_matrix: initial embedding weights; the Embedding layer is
            created non-trainable.
        dec_units: number of GRU units; also the width of the attention
            projections.
        batch_sz: batch size, stored on the instance.
    """

    def __init__(self, vocab_size, embedding_dim,embedding_matrix, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            weights=[embedding_matrix],
            trainable=False,
        )
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        # One output per vocabulary entry, so a softmax over it covers the vocabulary.
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Attention over the encoder outputs.
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step and print the shape of every intermediate.

        Args:
            x: token ids for the current step, shape (batch_size, 1).
            hidden: previous hidden state (the encoder state on the first
                step). It is used only as the attention query; the GRU is
                called without an explicit initial state.
            enc_output: encoder outputs, shape
                (batch_size, max_length, hidden_size).

        Returns:
            ``(logits, state, attention_weights)`` where ``logits`` has shape
            (batch_size, vocab).
        """
        print("input shape:")
        print("x:", x.shape)
        print("hidden:", hidden.shape)
        print("enc_output:", enc_output.shape)

        # Attention weights from the previous hidden state and the encoder outputs.
        context_vector, attention_weights = self.attention(hidden, enc_output)
        print("attenion output shape:")
        print("context_vector:", context_vector.shape)
        print("attention_weights:", attention_weights.shape)

        # (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)
        print("x after embedding:", embedded.shape)

        # Join the context vector with the embedded input token to form the
        # GRU input: (batch_size, 1, embedding_dim + hidden_size).
        context_with_time_axis = tf.expand_dims(context_vector, 1)
        gru_input = tf.concat([context_with_time_axis, embedded], axis=-1)
        print("x after concat:", gru_input.shape)

        gru_output, state = self.gru(gru_input)
        print("after gru")
        print("output:", gru_output.shape)
        print("state:", state.shape)

        # Collapse the time axis: (batch_size * 1, hidden_size).
        flat_output = tf.reshape(gru_output, (-1, gru_output.shape[2]))
        print("output after reshape:", flat_output.shape)

        # (batch_size, vocab)
        logits = self.fc(flat_output)
        print("x after fc:", logits.shape)
        return logits, state, attention_weights

### **测试**

In [22]:
# Build the decoder and run a single step to check shapes.
# The first argument is a (64, 1) random tensor standing in for token ids;
# only the resulting shapes are inspected here.
decoder = Decoder(vocab_size, embedding_dim,embedding_matrix, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((64, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

input shape:
x: (64, 1)
hidden: (64, 1024)
enc_output: (64, 260, 1024)
attenion output shape:
context_vector: (64, 1024)
attention_weights: (64, 260, 1)
x after embedding: (64, 1, 300)
x after concat: (64, 1, 1324)
after gru
output: (64, 1, 1024)
state: (64, 1024)
output after reshape: (64, 1024)
x after fc: (64, 32901)
Decoder output shape: (batch_size, vocab size) (64, 32901)


## 优化器和损失函数

[SparseCategoricalCrossentropy](https://tensorflow.google.cn/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy)

In [83]:
# Adam optimizer constructed with no arguments (library defaults).
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: predictions are raw scores, not probabilities.
# reduction='none': keep one loss value per element so loss_function can mask <PAD>.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# Index of the <PAD> token in the vocabulary
pad_index=vocab_index['<PAD>']

def loss_function(real, pred):
    """Cross-entropy loss in which <PAD> targets contribute zero.

    Args:
        real: target token ids, e.g. ``[0, 1, 2]``.
        pred: predicted logits, one row per target, e.g.
            ``[[.91, .4, .5], [.0, .88, .1], [.3, .3, .94]]``.

    Returns:
        The mean of the per-element losses after the entries whose target is
        ``pad_index`` have been set to zero. The mean is taken over all
        elements, padded ones included.
    """
    # Per-element loss (loss_object was built with reduction='none').
    per_token = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is <PAD>;
    # e.g. real = [0, 1, 2] with pad_index = 2 gives [1, 1, 0].
    # Cast to the loss dtype so the two can be multiplied.
    keep = tf.cast(tf.math.not_equal(real, pad_index), dtype=per_token.dtype)

    return tf.reduce_mean(per_token * keep)

## 保存点设置

In [84]:
# Checkpoint location and the objects whose state is saved together.
# NOTE(review): `os` is not imported explicitly in the visible cells;
# presumably it arrives through one of the star imports — confirm.
checkpoint_dir = 'data/checkpoints/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [87]:
checkpoint_prefix

'data/checkpoints/training_checkpoints\\ckpt'

# **训练**

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced training step on a single batch.

    Args:
        inp: input token ids for the batch.
        targ: target token ids, shape (BATCH_SIZE, target_length).
        enc_hidden: initial hidden state passed to the encoder.

    Returns:
        The accumulated per-step loss divided by ``targ.shape[1]``.

    Uses the notebook-level ``encoder``, ``decoder``, ``optimizer``,
    ``loss_function``, ``vocab_index`` and ``BATCH_SIZE``.
    """
    loss = 0

    # Only the forward pass is recorded on the tape.
    with tf.GradientTape() as tape:
        # 1. Encode the input batch.
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        # 2. The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden
        # 3. First decoder input: <START> repeated for every example,
        #    shape (BATCH_SIZE, 1). The id comes from vocab_index, the same
        #    mapping used for the <PAD> lookup elsewhere in the notebook.
        dec_input = tf.expand_dims([vocab_index['<START>']] * BATCH_SIZE, 1)

        # Teacher forcing: feed the target token as the next input.
        # The decoder is driven one position at a time. For a batch of 64
        # sentences the first step feeds <START>, the next step feeds the
        # token at the following position of every sentence, and so on.
        for t in range(1, targ.shape[1]):
            # predictions: (BATCH_SIZE, VOCAB_SIZE) logits for the softmax.
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            # The loss is accumulated position by position: note that this
            # is targ[:, t] (column t of every sentence), not targ[:t].
            loss += loss_function(targ[:, t], predictions)

            # Prepare the next input, shape (BATCH_SIZE, 1).
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Everything below runs outside the tape context so the backward pass
    # and the optimizer update are not themselves recorded.
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    # Initialise the encoder hidden state (all zeros) for this epoch
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        # One optimisation step on this batch; accumulate its loss for the epoch average
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # NOTE: batch % 1 == 0 is always true, so every batch is logged
        if batch % 1 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    # Average batch loss over the epoch, then wall-clock time for the epoch
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))