In [1]:
import os
import random
import re
import string
import warnings
from unicodedata import normalize

# Silence library warnings before the heavy third-party imports can emit them.
warnings.filterwarnings("ignore")

import keras
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import tqdm
from keras.callbacks import ModelCheckpoint
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Reshape
from keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding
from keras.models import load_model, Model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer



Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    """
    Read a whole UTF-8 text file and return its contents as one string.

    @param filename: path of the file to read

    Returns:
    text: the complete file contents
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

# break a loaded document into per-line lists of tab-separated fields
def to_pairs(doc):
    """
    Split a document into lines, and each line into its tab-separated fields.

    @param doc: full text of the document

    Returns:
    pairs: list with one entry per line, each entry being the list of fields
    """
    pairs = []
    # Leading/trailing whitespace of the whole document is dropped before splitting.
    for row in doc.strip().split('\n'):
        pairs.append(row.split('\t'))
    return pairs

def clean_data(lines):
    """
    Normalise every sentence of every pair to lowercase ASCII alphabetic words.

    @param lines: iterable of pairs (each pair an iterable of sentence strings)

    Returns:
    numpy array built from the cleaned pairs, one row per input pair
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean_sentence(sentence):
        # decompose accented characters, then drop everything that is not ASCII
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            # tokens that are empty or contain non-letters (e.g. digits) are discarded
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return np.array([[_clean_sentence(sentence) for sentence in pair] for pair in lines])



In [3]:
def text2sequences(max_len, lines):
    """
    Fit a new Tokenizer on `lines` and return the padded integer sequences.

    @param max_len: value passed to pad_sequences as maxlen
    @param lines: texts to fit the tokenizer on and to convert

    Returns:
    (padded sequences, tokenizer.word_index); the Tokenizer object itself is not returned
    """
    tok = Tokenizer(oov_token='<unk>')
    tok.fit_on_texts(lines)
    # padding='post' appends the fill values after the real tokens
    padded = pad_sequences(tok.texts_to_sequences(lines), maxlen=max_len, padding='post')
    return padded, tok.word_index

In [4]:
# softmax that can normalise along a chosen axis for tensors with more than 2 dims
def softmax(x, axis=1):
    """
    Softmax activation.

    2-D tensors are delegated to K.softmax; for higher ranks the softmax is computed
    manually along `axis`. Lower ranks raise ValueError.
    """
    rank = K.ndim(x)
    if rank == 2:
        return K.softmax(x)
    if rank > 2:
        # subtract the per-axis maximum before exponentiating
        shifted = K.exp(x - K.max(x, axis=axis, keepdims=True))
        return shifted / K.sum(shifted, axis=axis, keepdims=True)
    raise ValueError('Cannot apply softmax to a tensor that is 1D')

        

In [5]:
def one_step_attention(a, s_prev):
    """
    One attention step: score the encoder states against the previous decoder
    state and return their weighted combination.

    Uses the module-level layers repeator, concatenator, densor_tanh, densor_relu,
    activator and dotor.

    @param a: hidden states of the BiRNN
    @param s_prev: previous state of the decoder LSTM

    Returns:
    context: weighted context vector
    """
    # tile the decoder state so it can be paired with every encoder position
    repeated_state = repeator(s_prev)
    joined = concatenator([a, repeated_state])
    # two dense layers turn each joined vector into a single energy
    energies = densor_relu(densor_tanh(joined))
    # normalise energies into attention weights
    attention_weights = activator(energies)
    # weight the encoder states by the attention weights
    return dotor([attention_weights, a])

In [6]:
def pretrained_embedding_layer(word_to_vec_map, source_vocab_to_int):
    """
    Build a non-trainable Keras Embedding layer initialised from pretrained vectors.

    @param word_to_vec_map: dict mapping a word to its 1-D numpy vector
                            (all vectors are expected to share one length)
    @param source_vocab_to_int: dict mapping a vocabulary word to its integer index

    Returns:
    embedding_layer: Embedding layer whose weights are the assembled matrix

    Raises:
    ValueError: if word_to_vec_map is empty, so no embedding size can be inferred
    """
    if not word_to_vec_map:
        raise ValueError('word_to_vec_map is empty; cannot infer the embedding dimension')

    vocab_len = len(source_vocab_to_int) + 1        # Keras Embedding API +1
    # Infer the dimensionality from any stored vector instead of requiring that
    # one specific word ("the") is present in the map.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # initialize embedding matrix; rows of words without a pretrained vector stay zero
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # copy the pretrained vector of every known vocabulary word into its row
    for word, index in source_vocab_to_int.items():
        word_vector = word_to_vec_map.get(word)
        if word_vector is not None:
            emb_matrix[index, :] = word_vector

    # build embedding layer, cannot be trained
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # the layer must be built before weights can be assigned
    embedding_layer.build((None,))

    # set weights
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [7]:
filename = "fra.txt"
doc = load_doc(filename)
pairs = to_pairs(doc)

# choose sample size
# to keep the run time manageable, only the first 30000 pairs are used for training and testing
n_train = 30000
# Slice before cleaning: clean_data handles each pair independently, so this gives
# the same rows as cleaning everything and slicing afterwards, without the wasted work.
# (Use clean_data(pairs) instead to work with the full corpus.)
clean_pairs = clean_data(pairs[:n_train])
input_texts = clean_pairs[:, 0]
target_texts = clean_pairs[:, 1]

# create word level input sequence
input_sequences = [text.split() for text in input_texts]
# create word level target sequence, each one terminated by the end-of-sentence label
target_sequences = [text.split() + ['<eos>'] for text in target_texts]

# longest sequences (in tokens); the target length includes the '<eos>' label
max_encoder_seq_length = max(len(line) for line in input_sequences)
max_decoder_seq_length = max(len(line) for line in target_sequences)

In [8]:
# 80/20 split with a fixed seed, then fit one tokenizer per language on the training part only
X_train, X_test, y_train, y_test = train_test_split(
    input_sequences, target_sequences, test_size=0.2, random_state=42
)
source_text_to_int, source_vocab_to_int = text2sequences(max_encoder_seq_length, X_train)
target_text_to_int, target_vocab_to_int = text2sequences(max_decoder_seq_length, y_train)

# give index 0 an explicit name in both vocabularies
source_vocab_to_int['<pad>'] = 0
target_vocab_to_int['<pad>'] = 0

# reverse lookups: integer index -> token
source_int_to_vocab = {index: token for token, index in source_vocab_to_int.items()}
target_int_to_vocab = {index: token for token, index in target_vocab_to_int.items()}

X, Y = source_text_to_int, target_text_to_int

In [9]:
# Shape of the padded encoder input returned by text2sequences for X_train
X.shape

(24000, 7)

In [10]:
# X was assigned from source_text_to_int, so this is the same array and the same shape
source_text_to_int.shape

(24000, 7)

In [11]:
# one-hot encode each padded source sequence (one class per source vocabulary entry)
Xoh = np.array([to_categorical(row, num_classes=len(source_vocab_to_int)) for row in X])

In [12]:
# one-hot encode each padded target sequence (one class per target vocabulary entry)
Yoh = np.array([to_categorical(row, num_classes=len(target_vocab_to_int)) for row in Y])

In [13]:
# Short aliases for the encoder / decoder sequence lengths used by the model code
Tx, Ty = max_encoder_seq_length, max_decoder_seq_length

In [14]:
# load pretrained word embedding from glove
# The encoding is given explicitly (as load_doc does) so the read does not depend on
# the platform's default text encoding.
with open("glove.6B/glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
        line = line.strip().split()
        # skip blank lines instead of failing on line[0]
        if not line:
            continue
        # first field is the word, the remaining fields are its vector components
        curr_word = line[0]
        words.add(curr_word)
        word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

# Embedding layer
embedding_layer = pretrained_embedding_layer(word_to_vec_map, source_vocab_to_int)

Instructions for updating:
Colocations handled automatically by placer.


In [15]:
# Layers used by one_step_attention. They are module-level objects, so every call to
# one_step_attention (one per decoder step) goes through the same layer instances.
# repeats its input Tx times
repeator = RepeatVector(Tx)
# joins its inputs along the last axis
concatenator = Concatenate(axis=-1)
# two dense layers applied back to back; the second has a single unit
densor_tanh = Dense(32, activation = "tanh")
densor_relu = Dense(1, activation = "relu")
# custom softmax defined above; for inputs with more than 2 dims it normalises along
# axis 1 (its default) -- presumably the input-position axis, TODO confirm
activator = Activation(softmax, name='attention_weights')
# dot product of its two inputs along axis 1
dotor = Dot(axes = 1)

In [16]:
n_a = 32 # The hidden size of Bi-LSTM
n_s = 128 # The hidden size of LSTM in Decoder
# return_state=True: calling this layer yields three tensors (see build_model)
decoder_LSTM_cell = LSTM(n_s, return_state=True)
# one output unit per target vocabulary entry, normalised with the custom softmax
output_layer = Dense(len(target_vocab_to_int), activation=softmax)

# define model layers
# reshapes a target-vocabulary-sized vector to (1, vocabulary size)
reshapor = Reshape((1, len(target_vocab_to_int)))
# joins its inputs along the last axis
concator = Concatenate(axis=-1)

In [17]:
def build_model(Tx, Ty, n_a, n_s, source_vocab_size, target_vocab_size):
    """
    Assemble the attention encoder-decoder as a Keras functional Model.

    Relies on the module-level layers embedding_layer, reshapor, concator,
    decoder_LSTM_cell and output_layer, and on one_step_attention.

    @param Tx: length of the input sequence fed to the encoder
    @param Ty: number of decoder steps; the model has one output per step
    @param n_a: units of the LSTM wrapped by the Bidirectional encoder
    @param n_s: width of the s0 / c0 state inputs
    @param source_vocab_size: not used inside this function
    @param target_vocab_size: width of the out0 input

    Returns:
    model: Model taking [X, s0, c0, out0] and producing a list of Ty outputs
    """
    # input layer
    X = Input(shape=(Tx,))
    # Embedding layer
    embed = embedding_layer(X)
    # initialize Decoder LSTM
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    
    # Decoder input for LSTM layer
    out0 = Input(shape=(target_vocab_size, ), name='out0')
    out = reshapor(out0)
    
    s = s0
    c = c0
    
    # save outputs
    outputs = []
    
    # define Bi-LSTM
    a = Bidirectional(LSTM(n_a, return_sequences=True))(embed)
    
    # Decoder, iterate Ty rounds, each iteration generates one result
    for t in range(Ty):
    
        # get Context Vector
        context = one_step_attention(a, s)
        
        # concat Context Vector and the previous translated result
        # (out goes through reshapor here on every step, including the first one,
        # where it has already been reshaped once above)
        context = concator([context, reshapor(out)])
        # decoder_LSTM_cell returns three tensors: the first is carried on as s,
        # the third as c, the second is discarded
        s, _, c = decoder_LSTM_cell(context, initial_state=[s, c])
        
        # connect lstm output and dense layer
        out = output_layer(s)
        
        # save output result
        outputs.append(out)
    
    model = Model([X, s0, c0, out0], outputs)
    
    return model

In [18]:
def make_prediction(sentence, Tx):
    """
    Translate one tokenised sentence with the module-level `model`.

    @param sentence: list of source-language tokens
    @param Tx: fixed encoder input length; longer sentences are truncated to it

    Returns:
    list of predicted target tokens that precede the first '<eos>'
    """
    # encoding: unknown words map to the '<unk>' index
    unk_idx = source_vocab_to_int["<unk>"]
    # truncate to Tx so an over-long sentence still fits the fixed-size model input
    word_idx = [source_vocab_to_int.get(word, unk_idx) for word in sentence[:Tx]]
    # right-pad with 0 up to Tx
    word_idx = np.array(word_idx + [0] * (Tx - len(word_idx)))

    # translated results
    # A single sentence is predicted, so pass exactly one row of each zero-initialised
    # state array; every model input then has the same number of samples.
    preds = model.predict([word_idx.reshape(1, Tx), s0[:1], c0[:1], out0[:1]])
    # most likely vocabulary index at each decoder step
    predictions = np.argmax(preds, axis=-1)

    # to words
    pred_words = [target_int_to_vocab.get(idx[0], "<unk>") for idx in predictions]
    pred_string = " ".join(pred_words)
    # keep only what was generated before the first end-of-sentence marker
    pred_french = pred_string.split('<eos>')[0]
    return pred_french.split()

In [19]:
class LossHistory(keras.callbacks.Callback):
    """Callback that stores the loss of every batch and prints it every 200 batches."""

    def on_train_begin(self, logs={}):
        """Start with an empty loss list and a batch counter of zero."""
        self.logs = []
        self.i = 0

    def on_batch_end(self, batch, logs={}):
        """Record this batch's loss; print it when the counter is a multiple of 200."""
        batch_loss = logs['loss']
        self.logs.append(batch_loss)
        if not self.i % 200:
            print('Loss for {} iteration:'.format(self.i), batch_loss)
        self.i = self.i + 1
                

In [20]:
# Build the network, create the loss recorder and configure training
model = build_model(
    Tx, Ty, n_a, n_s,
    len(source_vocab_to_int), len(target_vocab_to_int),
)
his = LossHistory()
out = model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0.001),
    metrics=['accuracy'],
)

m = len(X)  # number of training samples
# all-zero initial decoder states and initial "previous output", one row per sample
s0, c0 = np.zeros((m, n_s)), np.zeros((m, n_s))
out0 = np.zeros((m, len(target_vocab_to_int)))
# swap the first two axes of Yoh and split along the new first axis: one target
# array per decoder step, matching the model's list of outputs
outputs = list(np.swapaxes(Yoh, 0, 1))

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [21]:
# Resume from previously saved weights when a checkpoint exists. On a fresh run the
# file has not been written yet, so skip loading and train from the initial weights.
if os.path.exists("final_seq2seq_model_1121.h5"):
    model.load_weights("final_seq2seq_model_1121.h5")
else:
    print("No saved weights found at final_seq2seq_model_1121.h5; using freshly initialised weights.")

In [49]:
# Train for 20 epochs; Keras' own progress output is off, the LossHistory
# callback prints the loss instead
model.fit(
    x=[X, s0, c0, out0],
    y=outputs,
    batch_size=128,
    epochs=20,
    callbacks=[his],
    verbose=0,
)
# persist the weights (author's note: 100 epochs in total across runs)
model.save_weights("final_seq2seq_model_1121.h5")

Loss for 0 iteration: 15.428822
Loss for 200 iteration: 15.181914
Loss for 400 iteration: 16.751379
Loss for 600 iteration: 15.314741
Loss for 800 iteration: 14.052397
Loss for 1000 iteration: 14.170492
Loss for 1200 iteration: 15.35292
Loss for 1400 iteration: 13.94512
Loss for 1600 iteration: 15.068814
Loss for 1800 iteration: 13.895139
Loss for 2000 iteration: 15.116055
Loss for 2200 iteration: 14.029553
Loss for 2400 iteration: 15.572563
Loss for 2600 iteration: 14.693933
Loss for 2800 iteration: 14.49578
Loss for 3000 iteration: 14.497648
Loss for 3200 iteration: 15.241299
Loss for 3400 iteration: 13.200629
Loss for 3600 iteration: 15.295173


In [22]:
# Randomly choose (up to) 20 samples from the test set. A dedicated seeded generator
# keeps the selection reproducible across runs without touching the global random
# state; min() avoids a ValueError when the test set has fewer than 20 items.
test_list = random.Random(42).sample(range(len(X_test)), min(20, len(X_test)))

In [50]:
# Translate one hand-written sentence and print the raw decoder output,
# including the '<eos>' / '<pad>' tokens that make_prediction would strip
sentence = ['i', 'want', 'this', 'cat']
unk_idx = source_vocab_to_int["<unk>"]
word_idx = [source_vocab_to_int.get(word, unk_idx) for word in sentence]
word_idx = np.array(word_idx + [0] * (Tx - len(word_idx)))
# translated results
# only one sentence is fed in, so pass one row of each zero-initialised state array
# to keep the sample count of all model inputs consistent
preds = model.predict([word_idx.reshape(-1,Tx), s0[:1], c0[:1], out0[:1]])
predictions = np.argmax(preds, axis=-1)
# to words
pred_words = [target_int_to_vocab.get(idx[0], "<unk>") for idx in predictions]
pred_string = " ".join(pred_words)
print(pred_string)

je veux le <eos> <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [51]:
# Print source, reference and predicted tokens for every sampled test sentence
for idx in test_list:
    # the reference is the target sequence without its trailing '<eos>' label
    reference = y_test[idx][:-1]
    candidate = make_prediction(X_test[idx], Tx)
    print('-')
    print('English:\t', X_test[idx])
    print('French(true):\t', reference)
    print('French(pred):\t', candidate)

-
English:	 ['tom', 'has', 'a', 'dog']
French(true):	 ['tom', 'a', 'un', 'chien']
French(pred):	 ['tom', 'est', 'un', 'un']
-
English:	 ['stay', 'down']
French(true):	 ['reste', 'baisse']
French(pred):	 ['reste']
-
English:	 ['youre', 'fun']
French(true):	 ['tes', 'marrante']
French(pred):	 ['vous', 'etes']
-
English:	 ['do', 'you', 'believe', 'him']
French(true):	 ['estce', 'que', 'tu', 'le', 'crois', 'lui']
French(pred):	 ['tu', 'que']
-
English:	 ['here', 'look', 'at', 'this']
French(true):	 ['voila', 'regarde', 'ca']
French(pred):	 ['regardez', 'la']
-
English:	 ['he', 'doesnt', 'want', 'it']
French(true):	 ['il', 'nen', 'veut', 'pas']
French(pred):	 ['il', 'ne', 'pas', 'pas']
-
English:	 ['itll', 'break']
French(true):	 ['ca', 'va', 'rompre']
French(pred):	 ['tout', 'est', 'monde']
-
English:	 ['its', 'the', 'best']
French(true):	 ['cest', 'le', 'meilleur']
French(pred):	 ['cest', 'un']
-
English:	 ['lets', 'take', 'a', 'walk']
French(true):	 ['allons', 'marcher']
French(pred):	 [

In [52]:
# evaluate model by test dataset
# print bleu score (sentence_bleu is imported in the notebook's import cell)
trueFrench = []
predFrench = []
sum_score = 0
for source_tokens, target_tokens in zip(X_test, y_test):
    candidate = make_prediction(source_tokens, Tx)
    # drop the trailing '<eos>' label from the reference
    reference = target_tokens[:-1]
    predFrench.append(candidate)
    trueFrench.append(reference)
    # weights=(1,0,0,0): only unigram matches contribute to the score
    score = sentence_bleu([reference], candidate, weights=(1,0,0,0))
    sum_score += score
# average sentence-level score; an empty test set yields 0.0 instead of a ZeroDivisionError
avg = sum_score / len(X_test) if len(X_test) else 0.0
print('SCORE IS :', avg)

SCORE IS : 0.29483933604682255
