#  Tutorial of Simple Seq2seq with Teacher forcing

### 1) Read Data

reference : https://www.tensorflow.org/tutorials/text/nmt_with_attention?hl=zh_cn

In [1]:
import json
import io
import re
import os
import time
import random
import numpy as np
import unicodedata
import tensorflow as tf
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

### 2) Text PreProcessing

#### （1） Text Cleaning

In [3]:
# 将 unicode 文件转换为 ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [4]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"}

In [5]:
def preprocess_sentence(sentence):
    
    sentence = unicode_to_ascii(sentence.lower().strip())
    
    sentence = " ".join([contractions[word] if word in contractions else word for word in sentence.split(' ') ][:-1])
    
    # 在单词与跟在其后的标点符号之间插入一个空格
    # 例如： "he is a boy." => "he is a boy ."
    # 参考：https://stackoverflosentence.com/questions/3645931/python-padding-punctuation-sentenceith-sentencehite-spaces-keeping-punctuation
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)

    # 除了 (a-z, A-Z, ".", "?", "!", ",")，将所有字符替换为空格
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)

    sentence = sentence.rstrip().strip()

    # 给句子加上开始和结束标记
    # 以便模型知道何时开始和结束预测
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

In [6]:
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this <end>
b'<start> \xc2\xbf puedo tomar prestado este <end>'


In [7]:
# 1. 去除重音符号
# 2. 清理句子
# 3. 返回这样格式的单词对：[ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')[:-1]

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [8]:
en, sp = create_dataset(path_to_file, None)

In [9]:
print(en[20000])
print(sp[20000])

<start> i do not like <end>
<start> no me gustan las <end>


#### (2) sentence tokenizing

In [10]:
def max_length(tensor):
    return max(len(t) for t in tensor)

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    
    lang_tokenizer.fit_on_texts(lang)
    
    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
    
    return tensor, lang_tokenizer

# def load_dataset(path, num_examples=None):
#     # 创建清理过的输入输出对
#     targ_lang, inp_lang = create_dataset(path, num_examples)

#     input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
#     target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

#     return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [11]:
def load_dataset(path, num_examples=None):
    # 创建清理过的输入输出对
    targ_lang, inp_lang = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [12]:
# 尝试实验不同大小的数据集
num_examples = 30000
input_tensor, output_tensor, input_tokenizer, output_tokenizer = load_dataset(path_to_file, num_examples)

# 计算目标张量的最大长度 （max_length）
max_length_targ, max_length_inp = max_length(output_tensor), max_length(input_tensor)

In [13]:
max_length_output, max_length_input = max_length(output_tensor), max_length(input_tensor)

In [14]:
# 采用 80 - 20 的比例切分训练集和验证集
input_tensor_train, input_tensor_val, output_tensor_train, output_tensor_val = train_test_split(input_tensor, output_tensor, test_size=0.2)

# 显示长度
print(len(input_tensor_train), len(output_tensor_train), len(input_tensor_val), len(output_tensor_val))

24000 24000 6000 6000


In [15]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [16]:
input_tensor_train[1000]

array([  1,  87, 392,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0])

In [17]:
print ("Input Language; index to word mapping")
convert(input_tokenizer, input_tensor_train[7000])
print ()
print ("Target Language; index to word mapping")
convert(output_tokenizer, output_tensor_train[7000])

Input Language; index to word mapping
1 ----> <start>
5 ----> tom
15 ----> se
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
5 ----> tom
2 ----> <end>


#### (3) DataSet Creating

In [18]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64

In [19]:
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, output_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [20]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 14]), TensorShape([64, 9]))

In [21]:
example_input_batch.shape

TensorShape([64, 14])

In [22]:
# example_input_batch

### 2) Encoder

##### 2) Model

In [23]:
class Encoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,encode_units):
        super(Encoder,self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.encode_units = encode_units
        self.embedding = tf.keras.layers.Embedding(self.vocab_size,self.embedding_dim)
        self.gru = tf.keras.layers.GRU(units=self.encode_units,return_sequences=True,return_state=True)
        
    
    def call(self,x):
        encoder_embedding = self.embedding(x)
        encode_output,encode_hidden_state = self.gru(encoder_embedding)
        
        return encode_output,encode_hidden_state
        

In [24]:
INPUT_VOCAB_SIZE = len(input_tokenizer.word_index)+1
ENCODER_EMBEDDING_SIZE = 256
ENCODER_UNIT = 128

In [25]:
INPUT_VOCAB_SIZE

4916

In [26]:
encoder = Encoder(INPUT_VOCAB_SIZE,ENCODER_EMBEDDING_SIZE,ENCODER_UNIT)

In [27]:
encode_output,encode_hidden_state = encoder(example_input_batch)

In [28]:
encode_hidden_state

<tf.Tensor: id=159, shape=(64, 128), dtype=float32, numpy=
array([[ 0.06025203,  0.06018748,  0.00585309, ...,  0.03294687,
        -0.04083669,  0.0157035 ],
       [ 0.06031514,  0.06010835,  0.00571282, ...,  0.03294995,
        -0.0408518 ,  0.01564984],
       [ 0.06051484,  0.06023348,  0.00582696, ...,  0.03311078,
        -0.04075034,  0.01580523],
       ...,
       [ 0.06039887,  0.06022925,  0.00583461, ...,  0.03299372,
        -0.04072356,  0.01577699],
       [ 0.05995977,  0.0598466 ,  0.005825  , ...,  0.03229364,
        -0.04065358,  0.01509264],
       [ 0.06022097,  0.05999106,  0.00582924, ...,  0.03280593,
        -0.04075763,  0.01562667]], dtype=float32)>

### 3) Decoder

##### 2) Model

In [43]:
class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,decode_unit,embedding_dim):
        super(Decoder,self).__init__()
        self.vocab_size = vocab_size
        self.decode_unit = decode_unit
        self.embedding_dim = embedding_dim
        
        ### structure
        self.gru =  tf.keras.layers.GRU(units=self.decode_unit,return_sequences=True,return_state=True)
        
        self.embeddding = tf.keras.layers.Embedding(self.vocab_size,self.embedding_dim)
        
        self.fc = tf.keras.layers.Dense(self.vocab_size)
        
        
        ### decode_input  【batch_size,word_index】
    def call(self,decode_input,encode_output):
        
        decode_input = self.embeddding(decode_input)
        
        shape = (decode_input.shape[0],encode_output.shape[1]-decode_input.shape[1],decode_input.shape[2])
        
        padding = tf.zeros(shape)
        
        decode_input = tf.concat([decode_input,padding],axis = 1)
        
        concat_vector = tf.concat([encode_output,decode_input,decode_input],axis = -1)
        
        decode_output,decode_hidden_state = self.gru(concat_vector)
        
        decode_output = tf.reduce_sum(decode_output,axis = 1)
        
        y = self.fc(decode_output)
        
        return y
        

In [44]:
### parameter
output_vocab_size = len(output_tokenizer.word_index)+1

DECODER_UNIT = 256

encode_embedding_dim = 256

In [45]:
output_vocab_size

2642

In [46]:
decoder = Decoder(output_vocab_size,DECODER_UNIT,encode_embedding_dim)

In [47]:
decode_input = example_target_batch[:, :1]
# decode_input = tf.convert_to_tensor([output_tokenizer.word_index['<start>']] * BATCH_SIZE)

In [48]:
# decode_input

In [49]:
predictions = decoder(decode_input,encode_output)

In [50]:
predictions

<tf.Tensor: id=549, shape=(64, 2642), dtype=float32, numpy=
array([[ 0.00443673, -0.01104843,  0.00246613, ..., -0.02069634,
        -0.04083196,  0.03339528],
       [ 0.00073739, -0.00400375,  0.01410341, ..., -0.01000342,
        -0.03755658,  0.04245183],
       [-0.0041207 , -0.01033499,  0.01618817, ..., -0.02738772,
        -0.04122337,  0.04896358],
       ...,
       [-0.02188004, -0.01317842,  0.00931021, ..., -0.02218042,
        -0.04914907,  0.03876209],
       [-0.01812549, -0.0052731 ,  0.00459114, ..., -0.02370732,
        -0.02571396,  0.03012667],
       [-0.00816171, -0.01248191,  0.00289739, ..., -0.01406125,
        -0.04600342,  0.0416571 ]], dtype=float32)>

### 4) Define Loss Function

In [51]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    
    loss_ *= mask

    return tf.reduce_mean(loss_)

### 5) Save Model

In [52]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

### 6）Train Function

In [55]:
@tf.function
def train_step(inp,tar):
    
    loss = 0
    
    with tf.GradientTape() as tape:
    
        encode_output,encode_hidden_state = encoder(inp)

        decode_input = tf.convert_to_tensor([output_tokenizer.word_index['<start>']] * BATCH_SIZE)

#         end_index = tf.cast(tf.argmin(tf.reduce_sum(targ,axis = 0)),tf.int32)

        for t in range(1,tar.shape[1]):
        
            decode_input = targ[:,:t]
        
            predictions = decoder(decode_input,encode_output)

            loss += loss_function(tar[:,t],predictions)
        
    
    batch_loss = (loss / int(targ.shape[1]))
    
    variables = encoder.trainable_variables + decoder.trainable_variables
    
    gradients = tape.gradient(loss,variables)
    
    
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss
    
    

In [56]:
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

In [70]:
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()

    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
    # 每 2 个周期（epoch），保存（检查点）一次模型
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
        
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,total_loss / steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.5259
Epoch 1 Batch 100 Loss 0.5203
Epoch 1 Batch 200 Loss 0.3780
Epoch 1 Batch 300 Loss 0.3685
Epoch 2 Batch 0 Loss 0.4723
Epoch 2 Batch 100 Loss 0.4364
Epoch 2 Batch 200 Loss 0.3229
Epoch 2 Batch 300 Loss 0.3291


'./training_checkpoints\\ckpt-3'

Epoch 2 Loss 0.3300
Time taken for 1 epoch 92.37497138977051 sec

Epoch 3 Batch 0 Loss 0.4399
Epoch 3 Batch 100 Loss 0.3781
Epoch 3 Batch 200 Loss 0.2927
Epoch 3 Batch 300 Loss 0.3039
Epoch 4 Batch 0 Loss 0.3827
Epoch 4 Batch 100 Loss 0.3308
Epoch 4 Batch 200 Loss 0.2431
Epoch 4 Batch 300 Loss 0.2933


'./training_checkpoints\\ckpt-4'

Epoch 4 Loss 0.2632
Time taken for 1 epoch 93.03660202026367 sec

Epoch 5 Batch 0 Loss 0.3563
Epoch 5 Batch 100 Loss 0.2892
Epoch 5 Batch 200 Loss 0.2124
Epoch 5 Batch 300 Loss 0.2717
Epoch 6 Batch 0 Loss 0.3354
Epoch 6 Batch 100 Loss 0.2745
Epoch 6 Batch 200 Loss 0.1752
Epoch 6 Batch 300 Loss 0.2379


'./training_checkpoints\\ckpt-5'

Epoch 6 Loss 0.2128
Time taken for 1 epoch 92.72807574272156 sec

Epoch 7 Batch 0 Loss 0.3061
Epoch 7 Batch 100 Loss 0.2211
Epoch 7 Batch 200 Loss 0.1513
Epoch 7 Batch 300 Loss 0.2107
Epoch 8 Batch 0 Loss 0.2896
Epoch 8 Batch 100 Loss 0.2053
Epoch 8 Batch 200 Loss 0.1417
Epoch 8 Batch 300 Loss 0.2010


'./training_checkpoints\\ckpt-6'

Epoch 8 Loss 0.1776
Time taken for 1 epoch 92.68789458274841 sec

Epoch 9 Batch 0 Loss 0.2864
Epoch 9 Batch 100 Loss 0.1870
Epoch 9 Batch 200 Loss 0.1535
Epoch 9 Batch 300 Loss 0.1845
Epoch 10 Batch 0 Loss 0.2716
Epoch 10 Batch 100 Loss 0.1167
Epoch 10 Batch 200 Loss 0.1219
Epoch 10 Batch 300 Loss 0.1853


'./training_checkpoints\\ckpt-7'

Epoch 10 Loss 0.1530
Time taken for 1 epoch 92.4054696559906 sec

Epoch 11 Batch 0 Loss 0.2276
Epoch 11 Batch 100 Loss 0.1447
Epoch 11 Batch 200 Loss 0.0976
Epoch 11 Batch 300 Loss 0.1777
Epoch 12 Batch 0 Loss 0.2141
Epoch 12 Batch 100 Loss 0.1348
Epoch 12 Batch 200 Loss 0.0893
Epoch 12 Batch 300 Loss 0.1648


'./training_checkpoints\\ckpt-8'

Epoch 12 Loss 0.1347
Time taken for 1 epoch 91.73920321464539 sec

Epoch 13 Batch 0 Loss 0.2191
Epoch 13 Batch 100 Loss 0.1280
Epoch 13 Batch 200 Loss 0.0888
Epoch 13 Batch 300 Loss 0.1569
Epoch 14 Batch 0 Loss 0.1961
Epoch 14 Batch 100 Loss 0.1092
Epoch 14 Batch 200 Loss 0.0874
Epoch 14 Batch 300 Loss 0.1511


'./training_checkpoints\\ckpt-9'

Epoch 14 Loss 0.1223
Time taken for 1 epoch 85.86132621765137 sec

Epoch 15 Batch 0 Loss 0.2275
Epoch 15 Batch 100 Loss 0.1175
Epoch 15 Batch 200 Loss 0.0758
Epoch 15 Batch 300 Loss 0.1489
Epoch 16 Batch 0 Loss 0.2160
Epoch 16 Batch 100 Loss 0.1013
Epoch 16 Batch 200 Loss 0.0716
Epoch 16 Batch 300 Loss 0.1370


'./training_checkpoints\\ckpt-10'

Epoch 16 Loss 0.1052
Time taken for 1 epoch 91.47631859779358 sec

Epoch 17 Batch 0 Loss 0.2040
Epoch 17 Batch 100 Loss 0.0829
Epoch 17 Batch 200 Loss 0.0810
Epoch 17 Batch 300 Loss 0.1248
Epoch 18 Batch 0 Loss 0.1732
Epoch 18 Batch 100 Loss 0.0771
Epoch 18 Batch 200 Loss 0.0823
Epoch 18 Batch 300 Loss 0.1250


'./training_checkpoints\\ckpt-11'

Epoch 18 Loss 0.0982
Time taken for 1 epoch 84.03984785079956 sec

Epoch 19 Batch 0 Loss 0.1877
Epoch 19 Batch 100 Loss 0.0893
Epoch 19 Batch 200 Loss 0.0849
Epoch 19 Batch 300 Loss 0.1367
Epoch 20 Batch 0 Loss 0.2016
Epoch 20 Batch 100 Loss 0.0954
Epoch 20 Batch 200 Loss 0.0876
Epoch 20 Batch 300 Loss 0.1161


'./training_checkpoints\\ckpt-12'

Epoch 20 Loss 0.0932
Time taken for 1 epoch 91.63279891014099 sec



In [81]:
# 恢复检查点目录 （checkpoint_dir） 中最新的检查点
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x21912183c88>

In [82]:
def evaluate(sentence):
#     attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [input_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_input,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    
#     encode_output,encode_hidden_state = encoder(inputs)
    enc_out, enc_hidden = encoder(inputs)

    dec_input = tf.expand_dims([output_tokenizer.word_index['<start>']],1)

    predicted_ids = [1]
    for t in range(max_length_targ):
        
        predictions = decoder(dec_input,enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        
        
        if output_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence
        else:
            if  predicted_ids[-1]!=predicted_id:
                predicted_ids.append(predicted_id)

        result = ' '.join([output_tokenizer.index_word[predicted_id] for predicted_id in predicted_ids])

        # 预测的 ID 被输送回模型
        dec_input = tf.expand_dims(predicted_ids,0)
        
    
    
    
    
    return result, sentence

In [83]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [84]:
translate(u'no me gustan las')

Input: <start> no me gustan <end>
Predicted translation: <start> i do not like


In [85]:
translate(u'¿todavia estan en casa?')

Input: <start> ¿ todavia estan en <end>
Predicted translation: <start> are you still


In [86]:
translate(u'trata de averiguarlo')

Input: <start> trata de <end>
Predicted translation: <start> try
