In [23]:
import re
import warnings
warnings.filterwarnings("ignore")
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Reshape
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import keras.backend as K
import keras
import numpy as np
import random
import tqdm
import matplotlib.pyplot as plt
import string
from unicodedata import normalize
from keras.utils.vis_utils import model_to_dot, plot_model
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu

In [2]:
# load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    @param filename: path of the text file to read

    Returns:
    text: the full contents of the file
    """
    # the context manager closes the handle even if read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

# split a loaded document into per-line lists of tab-separated fields
def to_pairs(doc):
    """Split a document into lines, then split every line on tab characters.

    @param doc: the whole document as a single string

    Returns:
    rows: list with one entry per line, each entry a list of that line's fields
    """
    rows = []
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

def clean_data(lines):
    """Normalise every sentence of every pair and return the result as an array.

    Each sentence is decomposed with NFD and reduced to ASCII, split on
    white space, lower-cased, stripped of punctuation and non-printable
    characters, and finally only purely alphabetic tokens are kept.

    @param lines: iterable of pairs, each pair an iterable of sentences

    Returns:
    numpy array built from the list of cleaned pairs
    """
    # characters outside string.printable are removed from each token
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(sentence):
        # drop everything that has no ASCII representation after decomposition
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # discards empty tokens and tokens containing digits
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return np.array([[scrub(sentence) for sentence in pair] for pair in lines])

In [3]:
# Read the raw corpus and split it into per-line lists of tab-separated fields.
filename = "fra.txt"
doc = load_doc(filename)
pairs = to_pairs(doc)

In [4]:
# choose sample size (use n_train = len(pairs) to keep the whole corpus)
n_train = 20000
# Slice before cleaning: each pair is cleaned independently, so this gives the
# same rows as cleaning everything and slicing afterwards, without the wasted work.
clean_pairs = clean_data(pairs[:n_train])
input_texts = clean_pairs[:, 0]
target_texts = clean_pairs[:, 1]

# create word level input sequence
input_sequences = [t.split() for t in input_texts]
# create word level target sequence, wrapped in the '\t' and '\n' marker tokens
target_sequences = [['\t'] + t.split() + ['\n'] for t in target_texts]

In [5]:
# Longest token sequence on each side; used later as the padding length.
max_encoder_seq_length = max(map(len, input_sequences))
max_decoder_seq_length = max(map(len, target_sequences))

In [6]:
# Report corpus size and the longest sequence on each side.
print('Length of input_texts:  ' + str(input_texts.shape))
# report the target array's own shape (previously printed input_texts.shape)
print('Length of target_texts: ' + str(target_texts.shape))
print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

Length of input_texts:  (20000,)
Length of target_texts: (20000,)
max length of input  sentences: 5
max length of target sentences: 14


In [7]:
def text2sequences(max_len, lines):
    """Fit a fresh Tokenizer on `lines` and return post-padded index sequences.

    @param max_len: length passed to pad_sequences as `maxlen`
    @param lines: the texts (or token lists) to fit on and convert

    Returns:
    padded: the index sequences padded at the end to `max_len`
    word_index: the fitted tokenizer's word -> index mapping
    """
    tok = Tokenizer()
    tok.fit_on_texts(lines)
    padded = pad_sequences(tok.texts_to_sequences(lines), maxlen=max_len, padding='post')
    return padded, tok.word_index

In [12]:
# NOTE(review): both names are reassigned from X_train in the train/test split
# cell below before they are used in the visible cells, so this call looks redundant.
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, input_sequences)

In [13]:
# NOTE(review): both names are reassigned from y_train in the train/test split
# cell below before they are used in the visible cells, so this call looks redundant.
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, target_sequences)

In [14]:
# Hold out 20% of the sentence pairs; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(input_sequences, target_sequences, test_size=0.2, random_state=42)
# Rebuild the padded sequences and vocabularies from the training split only.
# NOTE(review): X_test / y_test are not converted to index sequences in the visible cells.
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, X_train)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, y_train)

In [15]:
# Sanity check: array shapes and vocabulary sizes.
# The two *_token_index lines print len() of the mapping, not a shape.
print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (16000, 5)
shape of input_token_index: 3220
shape of decoder_input_seq: (16000, 14)
shape of target_token_index: 6200


In [66]:
# Vocabulary sizes, used below as the one-hot depth and the embedding row count.
# NOTE(review): the +1 presumably accounts for index 0 (the padding value),
# which has no entry in word_index — confirm.
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 3221
num_decoder_tokens: 6201


In [67]:
# Integer index matrices for both sides.
X = np.array(encoder_input_seq)
Y = np.array(decoder_input_seq)
# One-hot encode row by row and stack the rows back into a single array.
X_onehot = np.array([to_categorical(row, num_classes=num_encoder_tokens) for row in X])
Y_onehot = np.array([to_categorical(row, num_classes=num_decoder_tokens) for row in Y])

In [68]:
# Display the shape of the one-hot encoded encoder sequences.
X_onehot.shape

(16000, 5, 3221)

In [69]:
# Display the shape of the one-hot encoded decoder sequences.
Y_onehot.shape

(16000, 14, 6201)

In [19]:
# Load the pre-trained GloVe word vectors from the 50-dimensional file.
# Each non-empty line holds a word followed by the components of its vector.
# The encoding is given explicitly so the read does not depend on the platform default.
with open("glove.6B/glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    words = set()
    word_to_vec = {}
    for line in f:
        line = line.strip().split()
        if not line:
            # a blank line has no word to index
            continue
        cur_word = line[0]
        words.add(cur_word)
        word_to_vec[cur_word] = np.array(line[1:], dtype=np.float64)

In [20]:
def pretrain(word_to_vec, num_encoder_tokens, token_index=None, embedding_dim=50):
    """Build a frozen Embedding layer initialised from pre-trained word vectors.

    @param word_to_vec: mapping word -> vector of length `embedding_dim`
    @param num_encoder_tokens: number of rows of the embedding matrix
    @param token_index: mapping word -> row index; when omitted, the
        module-level `input_token_index` is used (the original behaviour)
    @param embedding_dim: length of each word vector (defaults to 50)

    Returns:
    embedding_layer: non-trainable Embedding layer holding the matrix
    """
    if token_index is None:
        # backward compatible default: fall back to the notebook global
        token_index = input_token_index

    vocab_len = num_encoder_tokens
    embedding_matrix = np.zeros((vocab_len, embedding_dim)) # initialize embedding matrix

    for word, index in token_index.items():
        word_vector = word_to_vec.get(word)
        # words without a pre-trained vector keep their all-zero row
        if word_vector is not None:
            embedding_matrix[index, :] = word_vector

    embedding_layer = Embedding(vocab_len, embedding_dim, trainable=False)
    # the layer must be built before its weights can be assigned
    embedding_layer.build((None, ))
    embedding_layer.set_weights([embedding_matrix])
    return embedding_layer

In [24]:
# Shared embedding layer for the encoder input; model() reads this module-level name.
embedding_layer = pretrain(word_to_vec, num_encoder_tokens)

In [26]:
# Display the per-sample shape of X_onehot (the batch axis is dropped).
X_onehot.shape[1:]

(5, 3221)

In [27]:
# Symbolic input whose per-sample shape matches one row of X_onehot.
# NOTE(review): not referenced by model() in the visible cells — confirm it is still needed.
encoder_input_x = Input(shape=(X_onehot.shape[1:]), name='encoder_input_x')

In [28]:
# Symbolic input whose per-sample shape matches one row of Y_onehot.
# NOTE(review): not referenced by model() in the visible cells — confirm it is still needed.
decoder_input_x = Input(shape=(Y_onehot.shape[1:]), name='decoder_input_x')

In [71]:
# Display the symbolic shape of the decoder input tensor.
decoder_input_x.shape

TensorShape([None, 14, 6201])

In [72]:
# customize softmax function
def softmax(x, axis=1):
    """
    Softmax over a chosen axis.

    Tensors with two dimensions are delegated to K.softmax (the `axis`
    argument is not used in that case). Tensors with more dimensions are
    normalised along `axis` after subtracting the per-axis maximum.

    @param x: input tensor
    @param axis: axis to normalise over when x has more than two dimensions

    Raises:
    ValueError: when x has fewer than two dimensions
    """
    rank = K.ndim(x)
    if rank < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    if rank == 2:
        return K.softmax(x)
    # shift by the maximum before exponentiating
    exps = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return exps / K.sum(exps, axis=axis, keepdims=True)

In [73]:
# customize layers
# These layer objects are created once at module level; one_step_attention()
# calls the same objects on every invocation.
repeator = RepeatVector(max_encoder_seq_length)  # repeats s_prev max_encoder_seq_length times
concatenator = Concatenate(axis=-1)              # joins encoder states and repeated s_prev on the last axis
densor_tanh = Dense(32, activation = "tanh")     # first dense layer applied to the concatenation
densor_relu = Dense(1, activation = "relu")      # produces one value per position (the "energies")
activator = Activation(softmax, name='attention_weights')  # custom softmax defined above (default axis=1)
dotor = Dot(axes = 1)                            # combines the weights with the encoder states

In [74]:
def one_step_attention(a, s_prev):
    """
    Attention mechanism: return the weighted context vector for one decoder step.

    Uses the module-level layers repeator, concatenator, densor_tanh,
    densor_relu, activator and dotor.

    @param a: BiRNN hidden states
    @param s_prev: previous hidden output of the decoder LSTM

    Returns:
    context: weighted context vector
    """
    # repeat s_prev max_encoder_seq_length times so it can be joined with `a`
    s_tiled = repeator(s_prev)
    # energies from the concatenation of encoder states and the repeated decoder state
    scores = densor_relu(densor_tanh(concatenator([a, s_tiled])))
    # attention weights
    weights = activator(scores)
    # weighted combination of the encoder states
    return dotor([weights, a])

In [75]:
n_a = 32 # The hidden size of Bi-LSTM
n_s = 128 # The hidden size of LSTM in Decoder
# Shared decoder layers: model() calls these same objects at every decoding step.
# return_state=True makes the LSTM call return its states along with the output.
decoder_LSTM_cell = LSTM(n_s, return_state=True)
# Projects the decoder output onto the target vocabulary using the custom softmax.
output_layer = Dense(num_decoder_tokens, activation=softmax)

In [76]:
# Shared helper layers used inside model():
# reshapor reshapes a token distribution to (1, num_decoder_tokens) per sample,
# concator joins it with the attention context along the last axis.
reshapor = Reshape((1, num_decoder_tokens))
concator = Concatenate(axis=-1)

In [78]:
def model(max_encoder_seq_length, max_decoder_seq_length, n_a, n_s, num_encoder_tokens, num_decoder_tokens):
    """Build the attention-based encoder/decoder Keras model.

    Relies on the module-level objects embedding_layer, reshapor, concator,
    decoder_LSTM_cell, output_layer and on one_step_attention().

    @param max_encoder_seq_length: length of the integer input sequence
    @param max_decoder_seq_length: number of decoding steps (= number of outputs)
    @param n_a: units of the LSTM wrapped by the Bidirectional encoder
    @param n_s: size of the s0 / c0 inputs
    @param num_encoder_tokens: not used inside this function
    @param num_decoder_tokens: size of the out0 input

    Returns:
    model: Model with inputs [X, s0, c0, out0] and a list of
           max_decoder_seq_length outputs (one per decoding step)
    """
    X = Input(shape=(max_encoder_seq_length,))
    # Embedding layer
    embed = embedding_layer(X)
    # Decoder initialize
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    
    # Decoder input for LSTM layer
    out0 = Input(shape=(num_decoder_tokens, ), name='out0')
    out = reshapor(out0)
    
    s = s0
    c = c0
    
    # save output results
    outputs = []
    
    # Define the Bi-LSTM encoder
    a = Bidirectional(LSTM(n_a, return_sequences=True))(embed)
    
    # Decoder, iterate max_decoder_seq_length rounds, each iteration generates one result
    for t in range(max_decoder_seq_length):
    
        # get Context Vector
        context = one_step_attention(a, s)
        
        # concat Context Vector and the previous translated result
        # NOTE(review): on the first iteration `out` is already reshapor(out0),
        # so reshapor is applied to it a second time here.
        context = concator([context, reshapor(out)])
        # the three values returned by the LSTM call are unpacked as s, _, c;
        # the middle one is discarded
        s, _, c = decoder_LSTM_cell(context, initial_state=[s, c])
        
        # connect lstm output and dense layer
        out = output_layer(s)
        
        # save output result
        outputs.append(out)
    
    model = Model([X, s0, c0, out0], outputs)
    
    return model

In [79]:
# NOTE(review): this rebinds the name `model` from the builder function to the
# object it returns. Re-running this cell without first re-running the cell
# that defines model(...) will call the returned object instead of the builder.
model = model(max_encoder_seq_length, max_decoder_seq_length, n_a, n_s, num_encoder_tokens, num_decoder_tokens)


In [80]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 5)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 5, 50)        161050      input_2[0][0]                    
__________________________________________________________________________________________________
s0 (InputLayer)                 [(None, 128)]        0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 5, 64)        21248       embedding[1][0]                  
____________________________________________________________________________________________

In [81]:
# Compile with Adam and categorical cross-entropy.
# `learning_rate` replaces the deprecated `lr` keyword of the optimizer.
# The assignment to `out` is kept so the notebook still defines that name.
out = model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, decay=0.001),
                    metrics=['accuracy'],
                    loss='categorical_crossentropy')

In [82]:
m = X.shape[0] # num of training sample
# All-zero arrays fed to the s0 / c0 / out0 inputs of the model.
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
out0 = np.zeros((m, num_decoder_tokens))
# Swap the sample and time-step axes, then split along the new first axis:
# one target array per decoding step, matching the model's list of outputs.
outputs = list(Y_onehot.swapaxes(0,1))

In [None]:
# Train the model
model.fit([X, s0, c0, out0], outputs, epochs=2, batch_size=128)

Train on 16000 samples
Epoch 1/2
Epoch 2/2