In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('../Dataset/TextSummary/train.csv')
dataX = data['document']
dataY = data['summary']

In [3]:
def CompareFunction(string):
    x = string.find('#')
    if x == -1:
        return True
    return False

In [4]:
X, Y = [], []
for line in dataX:
    X.append(list(filter(CompareFunction, line.split())))

for line in dataY:
    Y.append(list(filter(CompareFunction, line.split())))

In [5]:
specialWords = ['<unk>', '<start>', '<eos>', '<pad>']
words = []
word2idx = {}
idx2word = {}
idx = 0

for word in specialWords:
    words.append(word)
    word2idx[word] = idx
    idx2word[idx] = word
    idx += 1

for line in X:
    for word in line:
        if word not in words:
            words.append(word)
            word2idx[word] = idx
            idx2word[idx] = word
            idx += 1

for line in Y:
    for word in line:
        if word not in words:
            words.append(word)
            word2idx[word] = idx
            idx2word[idx] = word
            idx += 1

In [6]:
def TurnDataIntoIndex(inputs, maxLength):
    indexs = []
    for sentence in inputs:
        line = [word2idx[word] for word in sentence]
        indexs.append([word2idx['<start>']] + line[0 : maxLength] + [word2idx['<eos>']] + [word2idx['<pad>']] * (maxLength - len(line)))
    return indexs

trainX = TurnDataIntoIndex(X, 70)
trainX = np.array(trainX)
dictLength = 33400

In [7]:
'''
通过trainX -> input 最后得到 Output
'''
inputY, outputY = [], []
maxLengthY = 20
for sentence in Y:
    line = [word2idx[word] for word in sentence]
    inputY.append([word2idx['<start>']] + line[0 : maxLengthY] + [word2idx['<pad>']] * (maxLengthY - len(line)))
    outputY.append(line[0 : maxLengthY] + [word2idx['<eos>']] + [word2idx['<pad>']] * (maxLengthY - len(line)))

inputY = np.array(inputY)
outputY = np.array(outputY)

In [8]:
from tensorflow.keras import Model
class Seq2seqModel(Model):
    def __init__(self, dictSize, outputDim, hiddenUnits):
        super(Seq2seqModel, self).__init__()
        self.encoderEmbedding = tf.keras.layers.Embedding(
            input_dim = dictSize,
            output_dim = outputDim
        )
        # self.encoderLSTM = tf.keras.layers.LSTM(hiddenUnits, return_sequences = True, return_state = True)

        self.encoderLSTM = [
            tf.keras.layers.LSTM(hiddenUnits, return_sequences = True, return_state = True),
            tf.keras.layers.LSTM(hiddenUnits, return_sequences = True, return_state = True),
        ]

        self.decoderEmbedding = tf.keras.layers.Embedding(
            input_dim = dictSize,
            output_dim = outputDim
        )
        # self.decoderLSTM = tf.keras.layers.LSTM(
        #     hiddenUnits,
        #     return_sequences = True,
        #     return_state = True,
        #     name = 'Decoder'
        # )
        self.decoderLSTM = [
            tf.keras.layers.LSTM(hiddenUnits, return_sequences = True, return_state = True),
            tf.keras.layers.LSTM(hiddenUnits, return_sequences = True, return_state = True),
        ]
        
        self.attention = tf.keras.layers.Attention()

        self.outputDense = tf.keras.layers.Dense(
            dictSize,
            activation = 'softmax',
            name = 'OutputDense'
        )

    '''
    Inputs[0] -> TrainX
    Inputs[1] -> InputY
    '''
    def call(self, inputs):
        encoderEmbeddingOutput = self.encoderEmbedding(inputs[0])
        encoderOutput, encoderStateH, encoderStateC = self.encoderLSTM[0](encoderEmbeddingOutput)

        encoderOutput, encoderStateH, encoderStateC = self.encoderLSTM[1](
            encoderOutput,
            initial_state = [
                encoderStateH,
                encoderStateC
            ]
        )

        decoderEmbeddingOutput = self.decoderEmbedding(inputs[1])
        decoderOutput, decoderStateH, decoderStateC = self.decoderLSTM[0](
            decoderEmbeddingOutput,
            initial_state = [
                encoderStateH,
                encoderStateC
            ]
        )

        decoderOutput, decoderStateH, decoderStateC = self.decoderLSTM[0](
            decoderEmbeddingOutput,
            initial_state = [
                decoderStateH,
                decoderStateC
            ]
        )


        attentionOutput = self.attention([decoderOutput, encoderOutput])

        denseOutput = self.outputDense(attentionOutput)

        return denseOutput
    

    def ModelInferenceSingle(self, inputs, word2idx, idx2word, maxLengthY):
        sequence = [word2idx['<start>']]
        result = []
        states = {}

        encoderEmbeddingOutput = self.encoderEmbedding(inputs)
        encoderOutput, encoderStateH, encoderStateC = self.encoderLSTM[0](encoderEmbeddingOutput)

        encoderOutput, encoderStateH, encoderStateC = self.encoderLSTM[1](
            encoderOutput,
            initial_state = [
                encoderStateH,
                encoderStateC
            ]
        )

        states[0] = [encoderStateH, encoderStateC]

        for i in range(1, maxLengthY + 1):
            # --- Update Decoder
            decoderEmbeddingOutput = self.decoderEmbedding(np.array([sequence]))
            decoderOutput, decoderStateH, decoderStateC = self.decoderLSTM[0](
                decoderEmbeddingOutput,
                initial_state = [
                    states[i - 1][0],
                    states[i - 1][1]
                ]
            )

            decoderOutput, decoderStateH, decoderStateC = self.decoderLSTM[1](
                decoderEmbeddingOutput,
                initial_state = [
                    decoderStateH,
                    decoderStateC
                ]
            )

            # --- Ca Output
            attentionOutput = self.attention([decoderOutput, encoderOutput])
            modelOutput = self.outputDense(attentionOutput)
            outputIdx = np.argmax(modelOutput[0][i - 1])
            wordRes = idx2word[outputIdx]

            # --- Update Variables
            states[i] = [decoderStateH, decoderStateC]
            sequence.append(outputIdx)
                    
            if wordRes == '<eos>':
                break
            
            result.append(wordRes)
        
        return result

In [9]:
model = Seq2seqModel(dictLength, 128, 128)

model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = 'adam',
    metrics = ['sparse_categorical_accuracy']
)
model.build(input_shape = (None, len(trainX), 128))
model.load_weights("../../SavedModel/Seq2seqWithAttentionWeiht2.h5")

In [10]:
for _ in range(5):
    history = model.fit(
        [trainX, inputY],
        outputY,
        batch_size = 32,
        epochs = 1
    )
    model.save_weights("../../SavedModel/Seq2seqWithAttentionWeiht2.h5")



In [11]:
testData = pd.read_csv('../Dataset/TextSummary/test.csv')
dataX = testData['document']
dataY = testData['summary']

X = []
for line in dataX:
    X.append(list(filter(CompareFunction, line.split())))

testX = []
for sentence in X:
    line = []
    for word in sentence:
        if word in words:
            line.append(word2idx[word])
        else:
            line.append(word2idx['<unk>'])
    testX.append([word2idx['<start>']] + line[0 : 70] + [word2idx['<eos>']] + [word2idx['<pad>']] * (70 - len(line)))
testX = np.array(testX)    

In [12]:
n = 1
predictY = model.ModelInferenceSingle(np.array([trainX[n]]), word2idx, idx2word, 20)
print(' '.join(predictY[1: ]))
print()
print(' '.join(Y[n]))

<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

u.s. arrests wife and daughter of saddam deputy ; troops prepare for thanksgiving
