In [1]:
! ls ../../data/

CrowdFlowerAnnotations.txt  Flickr8k.token.txt	       machine_translation
ExpertAnnotations.txt	    Flickr_8k.devImages.txt    readme.txt
Flicker8k_smaller	    Flickr_8k.testImages.txt
Flickr8k.lemma.token.txt    Flickr_8k.trainImages.txt


In [2]:
! pip3 install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314477 sha256=ed27d19a84f32bfb6048db6e791987820e2c52697ae582f47ab1202f2dc61d03
  Stored in directory: /root/.cache/pip/wheels/24/aa/17/5bc7c72e9a37990a9620cc3aad0acad1564dcff6dbc2359de3
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Layer
import numpy as np
import random
import jieba

batch_size = 16  
epochs = 35  
latent_dim = 128  # Latent dimensionality of the encoding space.
num_samples = 2000  # Number of samples to train on.
data_path = '../../data/machine_translation/cmn.txt'

input_texts = []
target_texts = []
input_tokens = set()
target_tokens = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# for line in lines[: min(num_samples, len(lines) - 1)]:
for line in random.sample(lines, num_samples):
    input_text, target_text = line.split('\t')
    
    # We use "\t" as the "start sequence" and "\n" as "end sequence"
    target_text = '\t' + target_text + '\n'
    
    tmp = []
    for token in input_text.split(" "):
        token = token.replace(",", "").replace(".", "").replace("?", "").lower()
        tmp.append(token)
        if token not in input_tokens:
            input_tokens.add(token)
    input_texts.append(tmp)
    
    tmp = []
    for token in jieba.cut(target_text, cut_all=False):
        tmp.append(token)
        if token not in target_tokens:
            target_tokens.add(token)
    target_texts.append(tmp)

input_tokens = list(input_tokens)
target_tokens = list(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

print("\ninput data set:")
print(input_texts[:10])
print("\ntarget data set:")
print(target_texts[:10])

Number of samples: 2000
Number of unique input tokens: 2185
Number of unique output tokens: 3223
Max sequence length for inputs: 17
Max sequence length for outputs: 22

input data set:
[['a', 'string', 'of', 'disasters', 'struck', 'the', 'region'], ['tom', 'has', 'been', 'under', 'a', 'great', 'deal', 'of', 'strain', 'lately'], ['tom', 'was', 'questioned', 'by', 'the', 'police'], ['i', "can't", 'keep', 'up', 'with', 'tom'], ['do', 'what', 'you', 'think', 'is', 'right'], ['this', 'is', 'all', 'i', 'know'], ['what', 'tom', 'said', "doesn't", 'apply', 'in', 'this', 'situation'], ['he', 'differs', 'from', 'his', 'brother'], ['i', 'prefer', 'milk', 'to', 'juice'], ['it', 'is', 'wrong', 'to', 'cheat', 'at', 'cards']]

target data set:
[['\t', '一連串', '的', '災難', '襲擊', '這個', '地區', '。', '\n'], ['\t', '汤姆', '最近', '压力', '特别', '大', '。', '\n'], ['\t', '汤姆', '被', '警察', '盘问', '。', '\n'], ['\t', '我', '跟不上', '汤姆', '。', '\n'], ['\t', '做', '你', '認為', '正確', '的', '事', '。', '\n'], ['\t', '這是', '我', '所', '知道'

In [11]:
input_token_index = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_token_index = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

print("\ninput index:")
print(input_token_index)

print("\ntarget index:")
print(target_token_index)


input index:

target index:
{'棍子': 0, '云': 1, '空闲': 2, '狗叫': 3, '中': 4, '自由式': 5, '购物': 6, '參加': 7, '比起': 8, '那輛': 9, '而': 10, '地讀': 11, '父亲': 12, '加拿大人': 13, '不定': 14, '有票': 15, '青筋': 16, '規則': 17, '問': 18, '主人': 19, '婚姻': 20, '不吃': 21, '真的': 22, '洽商': 23, '數的': 24, '多少': 25, '25': 26, '汤': 27, '儥': 28, '創造': 29, '我们': 30, '多': 31, '钱': 32, '抱歉': 33, '禮貌': 34, '太郎': 35, '带给': 36, '見': 37, '变得': 38, '同意': 39, '新西裝': 40, '洗澡': 41, '前天': 42, '了車': 43, '没': 44, '吸塵器': 45, '變得': 46, '老师': 47, '草莓': 48, '好觉': 49, '雨衣': 50, 'Tom': 51, '小偷': 52, '小孩子': 53, '刚够': 54, '我会': 55, '複': 56, '气氛': 57, '公寓': 58, '交易': 59, '男人': 60, '同樣': 61, '椅上': 62, '两个': 63, '圖書館裡': 64, '上判斷': 65, '不動': 66, '在羅馬': 67, '開手': 68, '冬天': 69, '合身': 70, '鎖大門': 71, '最好': 72, '感情': 73, '喉嚨': 74, '掉': 75, '气压计': 76, '屋里': 77, '被捕': 78, '主因': 79, '小路': 80, '我媽媽': 81, '咬': 82, '成為': 83, '一直': 84, '终生': 85, '打得': 86, '認為': 87, '給我點': 88, '这可真': 89, '大學生': 90, '忘': 91, '死后': 92, '送行': 93, '警察': 94, '寄给': 95, '洞': 96, '便宜': 97

In [12]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, token in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[token]] = 1.
    for t, token in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[token]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[token]] = 1.

print("The shape is (batch_size, time_steps, input_dim)")
print("so it will be (number of x, length of each x, dimension of x)")
print("\nencoder input shape:")
print(encoder_input_data.shape)
print("\ndecoder input shape:")
print(decoder_input_data.shape)
print("\ndecoder target shape:")
print(decoder_target_data.shape)
            
print("\nencoder input data (1-hot):")
print(encoder_input_data[-2:])
print("\ndecoder input data (1-hot):")
print(decoder_input_data[-2:])
print("\ndecoder target data (1-hot):")
print(decoder_target_data[-2:])

The shape is (batch_size, time_steps, input_dim)
so it will be (number of x, length of each x, dimension of x)

encoder input shape:
(2000, 17, 2185)

decoder input shape:
(2000, 22, 3223)

decoder target shape:
(2000, 22, 3223)

encoder input data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder input data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder target data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
 

In [13]:
class Encoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(self.latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform')

    def call(self, x):
        output, state = self.lstm(x)
        return output, state


encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = Encoder(num_encoder_tokens, latent_dim, batch_size)
encoder_outputs, state = encoder(encoder_inputs)

print(f"encoder_inputs: {encoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"encoder_outputs: {encoder_outputs.shape}")

encoder_inputs: (None, None, 2185)
state: (None, 128)
encoder_outputs: (None, None, 128)


In [27]:
class Decoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform')
        self.fc = Dense(vocab_size)

    def call(self, x, hidden):
        # context_vector, attention_weights = self.attention(hidden, enc_output)
        # x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.lstm(x, initial_state=hidden)
        # output = tf.reshape(output, (-1, output.shape[2]))
        y = self.fc(output)

        return y, state


decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder = Decoder(num_decoder_tokens, latent_dim, batch_size)
decoder_outputs, _ = decoder(decoder_inputs, hidden=state)


print(f"decoder_inputs: {decoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"decoder_outputs: {decoder_outputs.shape}")

decoder_inputs: (None, None, 3223)
state: (None, 128)
decoder_outputs: (None, None, 3223)


In [28]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, None, 2185)] 0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, None, 3223)] 0                                            
__________________________________________________________________________________________________
encoder_2 (Encoder)             ((None, None, 128),  888960      input_6[0][0]                    
__________________________________________________________________________________________________
decoder_6 (Decoder)             ((None, None, 3223), 1703319     input_12[0][0]                   
                                                                 encoder_2[0][1]            

In [29]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<tensorflow.python.keras.callbacks.History at 0x7f7e774e6ad0>

In [30]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [None]:
# encoder_model = Model(encoder_inputs, encoder_states)

# decoder_state_input_h = Input(shape=(latent_dim,))
# decoder_state_input_c = Input(shape=(latent_dim,))
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_outputs, state_h, state_c = decoder(
#     decoder_inputs, hidden=decoder_states_inputs)
# decoder_states = [state_h, state_c]
# decoder_model = Model(
#     [decoder_inputs] + decoder_states_inputs,
#     [decoder_outputs] + decoder_states)

In [42]:
encoder_model = Model(encoder_inputs, state)

decoder_state_input = Input(shape=(latent_dim,))
decoder_outputs, decoder_state = decoder(
    decoder_inputs, hidden=decoder_state_input)

decoder_model = Model(
    [decoder_inputs] + [decoder_state_input],
    [decoder_outputs] + [decoder_state])

In [44]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, the_decoder_state = decoder_model.predict(
            [target_seq] + [states_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = the_decoder_state

    return decoded_sentence


for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: ['a', 'string', 'of', 'disasters', 'struck', 'the', 'region']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['tom', 'has', 'been', 'under', 'a', 'great', 'deal', 'of', 'strain', 'lately']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['tom', 'was', 'questioned', 'by', 'the', 'police']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['i', "can't", 'keep', 'up', 'with', 'tom']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['do', 'what', 'you', 'think', 'is', 'right']
Decoded sentence: 。。。我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['this', 'is', 'all', 'i', 'know']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['what', 'tom', 'said', "doesn't", 'apply', 'in', 'this', 'situation']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['he', 'differs', 'from', 'his', 'brother']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['i', 'prefer', 'milk', 'to', 'juice']
Decoded sentence: 我我我我我我我我我我我