In [1]:
! ls ../../data/

CrowdFlowerAnnotations.txt  Flickr8k.token.txt	       machine_translation
ExpertAnnotations.txt	    Flickr_8k.devImages.txt    readme.txt
Flicker8k_smaller	    Flickr_8k.testImages.txt
Flickr8k.lemma.token.txt    Flickr_8k.trainImages.txt


In [2]:
! pip3 install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314477 sha256=ed27d19a84f32bfb6048db6e791987820e2c52697ae582f47ab1202f2dc61d03
  Stored in directory: /root/.cache/pip/wheels/24/aa/17/5bc7c72e9a37990a9620cc3aad0acad1564dcff6dbc2359de3
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [14]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Layer
import numpy as np
import random
import jieba

batch_size = 16  
epochs = 35  
latent_dim = 128  # Latent dimensionality of the encoding space.
num_samples = 2000  # Number of samples to train on.
data_path = '../../data/machine_translation/cmn.txt'

input_texts = []
target_texts = []
input_tokens = set()
target_tokens = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# for line in lines[: min(num_samples, len(lines) - 1)]:
for line in random.sample(lines, num_samples):
    input_text, target_text = line.split('\t')
    
    # We use "\t" as the "start sequence" and "\n" as "end sequence"
    target_text = '\t' + target_text + '\n'
    
    tmp = []
    for token in input_text.split(" "):
        token = token.replace(",", "").replace(".", "").replace("?", "").lower()
        tmp.append(token)
        if token not in input_tokens:
            input_tokens.add(token)
    input_texts.append(tmp)
    
    tmp = []
    for token in jieba.cut(target_text, cut_all=False):
        tmp.append(token)
        if token not in target_tokens:
            target_tokens.add(token)
    target_texts.append(tmp)

input_tokens = list(input_tokens)
target_tokens = list(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

print("\ninput data set:")
print(input_texts[:10])
print("\ntarget data set:")
print(target_texts[:10])

Number of samples: 2000
Number of unique input tokens: 2158
Number of unique output tokens: 3223
Max sequence length for inputs: 21
Max sequence length for outputs: 25

input data set:
[['there', 'are', 'four', 'seasons', 'in', 'one', 'year'], ['she', 'really', 'wants', 'to', 'go'], ['he', 'kept', 'his', 'promise', 'and', 'helped', 'his', 'brothers'], ["we're", 'out', 'of', 'tissue', 'paper', 'so', "i've", 'got', 'to', 'go', 'buy', 'some'], ['frankly', 'speaking', 'i', "don't", 'agree', 'with', 'you'], ['tom', 'was', 'able', 'to', 'make', 'himself', 'understood', 'in', 'french', 'when', 'he', 'visited', 'paris'], ["what's", 'your', 'favorite', 'game', 'to', 'play', 'with', 'friends'], ['i', 'warned', 'him', 'about', 'the', 'danger'], ['the', 'fish', 'he', 'caught', 'yesterday', 'is', 'still', 'alive'], ['they', 'like', 'apples']]

target data set:
[['\t', '一年', '有', '四季', '。', '\n'], ['\t', '她', '特别', '想', '去', '。', '\n'], ['\t', '他', '履行', '了', '他', '的', '承诺', '，', '并且', '帮助', '了', '他

In [15]:
input_token_index = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_token_index = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

print("\ninput index:")
print(input_token_index)

print("\ntarget index:")
print(target_token_index)


input index:
{'hurt': 0, 'everyone': 1, 'shut': 2, 'year!': 3, 'discovered': 4, 'while': 5, 'woman': 6, '1000': 7, 'collection': 8, 'goods': 9, 'mirror': 10, 'drink': 11, 'ten-dollar': 12, 'anything': 13, 'moved': 14, 'facebook': 15, 'second': 16, 'flows': 17, 'over': 18, 'worse': 19, 'nobody': 20, 'building': 21, 'chair': 22, 'oranges': 23, 'big': 24, 'five': 25, 'scientists': 26, 'lots': 27, 'cooked': 28, 'me"': 29, 'la': 30, 'tells': 31, 'location': 32, 'started': 33, 'unaware': 34, 'ladder': 35, 'enemies': 36, 'no!': 37, 'idea"': 38, 'actually': 39, 'knows': 40, 'skate': 41, 'service': 42, "isn't": 43, 'england': 44, 'wonder': 45, 'my': 46, 'lives': 47, 'choices': 48, 'leaves': 49, 'ac': 50, 'difference': 51, 'salary': 52, 'tragic': 53, 'divorced': 54, 'almost': 55, 'relies': 56, 'wake': 57, 'salty': 58, 'dogs': 59, 'taxes': 60, "who's": 61, 'childhood': 62, 'eagle': 63, "would've": 64, 'population': 65, 'yen': 66, 'suit': 67, 'clear': 68, 'one': 69, 'bread': 70, 'fuels': 71, 'mas

In [16]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, token in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[token]] = 1.
    for t, token in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[token]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[token]] = 1.

print("The shape is (batch_size, time_steps, input_dim)")
print("so it will be (number of x, length of each x, dimension of x)")
print("\nencoder input shape:")
print(encoder_input_data.shape)
print("\ndecoder input shape:")
print(decoder_input_data.shape)
print("\ndecoder target shape:")
print(decoder_target_data.shape)
            
print("\nencoder input data (1-hot):")
print(encoder_input_data[-2:])
print("\ndecoder input data (1-hot):")
print(decoder_input_data[-2:])
print("\ndecoder target data (1-hot):")
print(decoder_target_data[-2:])

The shape is (batch_size, time_steps, input_dim)
so it will be (number of x, length of each x, dimension of x)

encoder input shape:
(2000, 21, 2158)

decoder input shape:
(2000, 25, 3223)

decoder target shape:
(2000, 25, 3223)

encoder input data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder input data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder target data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
 

In [17]:
class Encoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(self.latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform')

    def call(self, x):
        output, state = self.lstm(x)
        return output, state


encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = Encoder(num_encoder_tokens, latent_dim, batch_size)
encoder_outputs, state = encoder(encoder_inputs)

print(f"encoder_inputs: {encoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"encoder_outputs: {encoder_outputs.shape}")

encoder_inputs: (None, None, 2158)
state: (None, 128)
encoder_outputs: (None, None, 128)


In [34]:
import tensorflow as tf

class BahdanauAttention(Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        # query is the decoder's current hidden state (h_t)
        # values are the encoder's output states (h_s from s=1 to T_x)

        # query hidden state shape == (batch_size, hidden size)
        # We need to expand dims for broadcasting with values
        query_with_time_axis = tf.expand_dims(query, 1) # (batch_size, 1, hidden size)
        print(f"query_with_time_axis: {query_with_time_axis.shape}")

        # Calculate the score (alignment scores)
        # score shape == (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        print(f"score: {score.shape}")

        # Apply softmax to get attention weights
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        print(f"score: {attention_weights.shape}")

        # Calculate the context vector
        # context_vector shape == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        print(f"score: {context_vector.shape}")

        return context_vector, attention_weights

In [35]:
class Decoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform')
        self.fc = Dense(vocab_size, activation='softmax')
        self.attention = BahdanauAttention(self.latent_dim)

    def call(self, x, hidden, enc_output):
        print(f"x: {x.shape}")
        print(f"hidden: {hidden.shape}")
        print(f"enc_output: {enc_output.shape}")
        
        context_vector, attention_weights = self.attention(hidden, enc_output)
        print(f"context_vector: {context_vector.shape}")
        print(f"attention_weights: {attention_weights.shape}")
        
        x, state = self.lstm(x, initial_state=hidden)
        # x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        #NOTE: does reshape needed?
        # output = tf.reshape(output, (-1, output.shape[2]))
        y = self.fc(x)

        return y, state


decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder = Decoder(num_decoder_tokens, latent_dim, batch_size)
decoder_outputs, _ = decoder(decoder_inputs, hidden=state, enc_output=encoder_outputs)


print(f"decoder_inputs: {decoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"decoder_outputs: {decoder_outputs.shape}")

x: (None, None, 3223)
hidden: (None, 128)
enc_output: (None, None, 128)
query_with_time_axis: (None, 1, 128)
score: (None, None, 1)
score: (None, None, 1)
score: (None, 128)
context_vector: (None, 128)
attention_weights: (None, None, 1)
decoder_inputs: (None, None, 3223)
state: (None, 128)
decoder_outputs: (None, None, 3223)


In [36]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, None, 2158)] 0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, None, 3223)] 0                                            
__________________________________________________________________________________________________
encoder_1 (Encoder)             ((None, None, 128),  878592      input_5[0][0]                    
__________________________________________________________________________________________________
decoder_9 (Decoder)             ((None, None, 3223), 1736472     input_12[0][0]                   
                                                                 encoder_1[0][0]            

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2)

Epoch 1/35
x: (16, 25, 3223)
hidden: (16, 128)
enc_output: (16, 21, 128)
query_with_time_axis: (16, 1, 128)
score: (16, 21, 1)
score: (16, 21, 1)
score: (16, 128)
context_vector: (16, 128)
attention_weights: (16, 21, 1)
x: (16, 25, 3223)
hidden: (16, 128)
enc_output: (16, 21, 128)
query_with_time_axis: (16, 1, 128)
score: (16, 21, 1)
score: (16, 21, 1)
score: (16, 128)
context_vector: (16, 128)
attention_weights: (16, 21, 1)
hidden: (16, 128)
enc_output: (16, 21, 128)
query_with_time_axis: (16, 1, 128)
score: (16, 21, 1)
score: (16, 21, 1)
score: (16, 128)
context_vector: (16, 128)
attention_weights: (16, 21, 1)
Epoch 2/35
Epoch 3/35

In [30]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [None]:
# encoder_model = Model(encoder_inputs, encoder_states)

# decoder_state_input_h = Input(shape=(latent_dim,))
# decoder_state_input_c = Input(shape=(latent_dim,))
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_outputs, state_h, state_c = decoder(
#     decoder_inputs, hidden=decoder_states_inputs)
# decoder_states = [state_h, state_c]
# decoder_model = Model(
#     [decoder_inputs] + decoder_states_inputs,
#     [decoder_outputs] + decoder_states)

In [42]:
encoder_model = Model(encoder_inputs, state)

decoder_state_input = Input(shape=(latent_dim,))
decoder_outputs, decoder_state = decoder(
    decoder_inputs, hidden=decoder_state_input)

decoder_model = Model(
    [decoder_inputs] + [decoder_state_input],
    [decoder_outputs] + [decoder_state])

In [44]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, the_decoder_state = decoder_model.predict(
            [target_seq] + [states_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = the_decoder_state

    return decoded_sentence


for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: ['a', 'string', 'of', 'disasters', 'struck', 'the', 'region']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['tom', 'has', 'been', 'under', 'a', 'great', 'deal', 'of', 'strain', 'lately']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['tom', 'was', 'questioned', 'by', 'the', 'police']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['i', "can't", 'keep', 'up', 'with', 'tom']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['do', 'what', 'you', 'think', 'is', 'right']
Decoded sentence: 。。。我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['this', 'is', 'all', 'i', 'know']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['what', 'tom', 'said', "doesn't", 'apply', 'in', 'this', 'situation']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['he', 'differs', 'from', 'his', 'brother']
Decoded sentence: 。。我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['i', 'prefer', 'milk', 'to', 'juice']
Decoded sentence: 我我我我我我我我我我我