In [1]:

import collections
import numpy as np
from keras.layers import Input, Dense, Bidirectional, LSTM
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.layers.embeddings import Embedding
import tensorflow as tf

#Load Data

In [2]:
inputsents=[]
cnsents=[]
engsents=[]
with open('/content/cmn.txt','r') as f:
  for line in f.readlines():
    pair=line.split('\t')[:2]
    pair[0]=pair[0].rstrip()
    pair[0]=pair[0].lstrip()
    pair[1]=pair[1].rstrip()
    pair[1]=pair[1].lstrip()
    inputsents.append(pair)
    cnsents.append(pair[1])
    engsents.append(pair[0])
inputsents[:5]

[['Hi.', '嗨。'],
 ['Hi.', '你好。'],
 ['Run.', '你用跑的。'],
 ['Wait!', '等等！'],
 ['Wait!', '等一下！']]

In [3]:
print(len(cnsents))
print(len(engsents))
num_of_sents=len(cnsents)
max_len_cn=max([len(s) for s in cnsents])
max_len_eng=max([len(s) for s in engsents])
print('max length of chinese sentences', max_len_cn)
print('max length of english sentences', max_len_eng)

24360
24360
max length of chinese sentences 44
max length of english sentences 163


#Preprocessing

##Tokenize Sentences

In [4]:
from keras.preprocessing.text import Tokenizer

def tokenize(x, vocab_size=10000, encode_start_end = False):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :param encode_start_end: if True, pad the start & end of sentence as separate tokens
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    if encode_start_end:
        x = ["startofsentence " + sentence + " endofsentence" for sentence in x]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    tokenized_x = tokenizer.texts_to_sequences(x)
    
    return tokenized_x, tokenizer

In [5]:
cn_tokenized, cn_tokenizer=tokenize(cnsents)
en_tokenized, en_tokenizer=tokenize(engsents, encode_start_end=True)

##padding

In [6]:
from keras.preprocessing.sequence import pad_sequences

def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    if length is None:
        length = max([len(sentence) for sentence in x])
        
    padded_x = pad_sequences(x, maxlen = length, padding = 'post', truncating = 'post')
    
    return padded_x

In [7]:
padded_cn=pad(cn_tokenized, max_len_cn)
padded_en=pad(en_tokenized, max_len_eng)

cn_vocab_size=len(cn_tokenizer.word_index)
en_vocab_size=len(en_tokenizer.word_index)

print('Chinese vocab size ', cn_vocab_size)
print('English vocab size ', en_vocab_size)

print('Chinese padded sequence length', len(padded_cn[0]))
print('English padded sequence length', len(padded_en[0]))

print(padded_cn.shape)
print(padded_en.shape)
print((en_tokenizer.word_index))

Chinese vocab size  22429
English vocab size  6738
Chinese padded sequence length 44
English padded sequence length 163
(24360, 44)
(24360, 163)


#Build Model

In [8]:
en_target=padded_en[:,1:,]
padded_cn[50]

array([1665,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)

In [15]:
#encoder

cn_input=padded_cn
en_input=padded_en[:,:-1,]
en_target=padded_en[:,1:,]



encoder_input=Input(shape=(None,), name='encoder_input')
emb_dim=200
embedding=Embedding(cn_vocab_size+1, emb_dim, mask_zero=True)
cn_embedding=embedding(encoder_input)
encoder_lstm=LSTM(256, activation='relu', return_state=True, name='encoder_LSTM')

encoder_outputs, last_hidden, last_cell_state=encoder_lstm(cn_embedding)

#decoder
decoder_input=Input(shape=(None, ), name='decoder_input')
embedding=Embedding(en_vocab_size+1, emb_dim, mask_zero=True)
en_embedding=embedding(decoder_input)

decoder_lstm=LSTM(256, activation='relu', return_sequences=True, return_state=True, name='decoder_LSTM')
decoder_outputs, _,_=decoder_lstm(en_embedding, initial_state=[last_hidden, last_cell_state])
decoder_dense=Dense(en_vocab_size+1, activation='softmax', name='decoder_dense')
logits=decoder_dense(decoder_outputs)

model=Model([encoder_input, decoder_input], logits)

model.compile(loss = 'sparse_categorical_crossentropy',
                        optimizer = Adam(lr = 0.002),
                        metrics = ['accuracy'])

model.fit([cn_input, en_input],
                    en_target,
                    batch_size = 128,
                    epochs = 20,
                    validation_split = 0.2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f016b2ade10>

In [16]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 200)    4486000     encoder_input[0][0]              
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 200)    1347800     decoder_input[0][0]              
____________________________________________________________________________________________

In [11]:
decoder_lstm.input_spec

[InputSpec(shape=(None, None, 200), ndim=3)]

# Inference Model

In [17]:
#encoder
last_states_encoder = [last_hidden, last_cell_state]
inference_encoder_model = Model(encoder_input, 
                                last_states_encoder)

#decoer
decoder_initial_state = [Input(shape = (256,)), Input(shape = (256,))]
all_hidden_decoder, last_hidden_decoder, last_cell_decoder = decoder_lstm(embedding(decoder_input), 
                                                                          initial_state = decoder_initial_state)

logits = decoder_dense(all_hidden_decoder)

inference_decoder_model = Model([decoder_input] + decoder_initial_state, 
                                [logits, 
                                          last_hidden_decoder, 
                                          last_cell_decoder])



In [18]:
target_id_to_word = {idx:word for word, idx in en_tokenizer.word_index.items()}

def decode_sequence(input_seq):
    """
    Gets predictions using the final model defined above
    :param input_seq: (list) encoded english sentence (list of word ids)
    returns : translated French sentence
    """
    states_value = inference_encoder_model.predict(input_seq)
    # Initialize decoder input as a length 1 sentence containing "startofsentence",
    # --> feeding the start token as the first predicted word
    prev_word = np.zeros((1, 1, 1))
    prev_word[0, 0, 0] = en_tokenizer.word_index["startofsentence"]

    stop_condition = False
    translation = []
    decoded_sentence=[]
    while not stop_condition:
        # 1. predict the next word using decoder model
        logits, last_h, last_c = inference_decoder_model.predict([prev_word] + states_value)
        
        # 2. Update prev_word with the predicted word
        predicted_id = np.argmax(logits[0, 0, :])
        #print(decoder_input)
        #print(logits)
        predicted_word = target_id_to_word[predicted_id]
        decoded_sentence.append(predicted_word)

        # 3. Enable End Condition: (1) if predicted word is "endofsentence" OR
        #                          (2) if translated sentence reached maximum sentence length
        if (predicted_word == 'endofsentence' or len(translation) > en_target.shape[1]):
            stop_condition = True

        # 4. Update prev_word with the predicted word
        prev_word[0, 0, 0] = predicted_id

        # 5. Update initial_states with the previously predicted word's encoder output
        states_value = [last_h, last_c]

    return " ".join(decoded_sentence).replace('endofsentence', '')

In [19]:
print(cnsents[240])

不要动。


In [21]:
# TODO: Print prediction(s)
for i in [3000]:
    cn_seq = padded_cn[i].reshape(1, padded_cn.shape[1])
    print(cn_seq)
    eng_translation = decode_sequence(cn_seq)
    
    print("Chinese Sentence            : ", cnsents[i])
    print("Predicted English Translation: ", eng_translation)
    print("Correct English Translation  : ", engsents[i])
    print()

[[3817    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
Chinese Sentence            :  她很可爱。
Predicted English Translation:  she is very pretty 
Correct English Translation  :  She is very pretty.

