# There are some tips to learn
## It is actually harder to learn compare to vanilla seq2seq
* Add dropout
  * because we are adding some more layers
* Remove `initial_state=hidden`, get all information purely from attention layers
* Add `clipnorm`
* Change `learning_rate` is the most important thing
* Add masks to AdditiveAttention
  * If we have mask, it will learn better

# NOTE
* After I training the seq2seq model with different `flavors` and `options`, all those smaller tiny changes did help.
  * If there is NO model just learn it anyway eventually, most of the time, the model just doesn't learn...

In [1]:
! ls ../../data/

CrowdFlowerAnnotations.txt  Flickr8k.token.txt	       machine_translation
ExpertAnnotations.txt	    Flickr_8k.devImages.txt    readme.txt
Flicker8k_smaller	    Flickr_8k.testImages.txt
Flickr8k.lemma.token.txt    Flickr_8k.trainImages.txt


In [2]:
! pip3 install jieba

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Layer, Dropout, Embedding
import numpy as np
import random
import jieba

batch_size = 16  
epochs = 15  
latent_dim = 128  # Latent dimensionality of the encoding space.
num_samples = 2000  # Number of samples to train on.
data_path = '../../data/machine_translation/cmn.txt'

input_texts = []
target_texts = []
input_tokens = set()
target_tokens = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# for line in lines[: min(num_samples, len(lines) - 1)]:
for line in random.sample(lines, num_samples):
    input_text, target_text = line.split('\t')
    
    # We use "\t" as the "start sequence" and "\n" as "end sequence"
    target_text = '\t' + target_text + '\n'
    
    tmp = []
    for token in input_text.split(" "):
        token = token.replace(",", "").replace(".", "").replace("?", "").lower()
        tmp.append(token)
        if token not in input_tokens:
            input_tokens.add(token)
    input_texts.append(tmp)
    
    tmp = []
    for token in jieba.cut(target_text, cut_all=False):
        tmp.append(token)
        if token not in target_tokens:
            target_tokens.add(token)
    target_texts.append(tmp)

input_tokens = list(input_tokens)
target_tokens = list(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

print("\ninput data set:")
print(input_texts[:10])
print("\ntarget data set:")
print(target_texts[:10])

2025-06-23 10:31:13.491518: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-23 10:31:13.491636: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.231 seconds.
Prefix dict has been built successfully.


Number of samples: 2000
Number of unique input tokens: 2198
Number of unique output tokens: 3211
Max sequence length for inputs: 32
Max sequence length for outputs: 32

input data set:
[['keep', 'off', 'the', 'grass'], ['mom', 'is', 'getting', 'dinner', 'ready'], ["let's", 'play', 'soccer'], ['tom', "can't", 'go', 'home', 'until', 'after', '2:30'], ['i', 'will', 'wait', 'here', 'till', 'he', 'comes'], ['put', 'the', 'chair', 'in', 'front', 'of', 'the', 'desk'], ['a', 'cafeteria', 'is', 'a', 'self-service', 'style', 'restaurant'], ['tom', 'encouraged', 'his', 'son', 'to', 'study', 'french'], ['i', 'was', 'surprised', 'to', 'see', 'a', 'lion'], ['where', 'is', 'the', 'nearest', 'police', 'station']]

target data set:
[['\t', '不要', '踩', '草地', '。', '\n'], ['\t', '媽媽', '快', '把', '晚餐', '準備', '好', '了', '。', '\n'], ['\t', '去', '踢足球', '吧', '。', '\n'], ['\t', '汤姆', '在', '2', '：', '30', '之后', '才能', '回家', '。', '\n'], ['\t', '我會', '在', '這裡', '等', '直到', '他來', '。', '\n'], ['\t', '把', '椅子', '放在', '桌子'

In [2]:
input_token_index = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_token_index = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

print("\ninput index:")
print(input_token_index)

print("\ntarget index:")
print(target_token_index)


input index:
{'carry': 0, 'sending': 1, 'smell': 2, 'kilometers': 3, 'turning': 4, 'seeing': 5, 'men': 6, "we'll": 7, 'open': 8, 'kobe': 9, 'watched': 10, 'happy"': 11, 'subject': 12, "he's": 13, 'memory': 14, 'experience': 15, 'discussed': 16, 'wet': 17, 'smile': 18, 'vacation': 19, 'greater': 20, 'realize': 21, 'respect': 22, 'supplies': 23, 'attended': 24, 'living': 25, 'ordinary': 26, 'chair': 27, 'six': 28, 'sitting': 29, 'greeted': 30, 'elephants': 31, 'thrilling': 32, 'girlfriend': 33, 'disaster': 34, 'changes': 35, 'rotten': 36, 'clothing': 37, 'committee': 38, 'competed': 39, 'disappoint': 40, 'worker': 41, 'work': 42, "isn't": 43, 'leaving': 44, 'confused': 45, 'twin': 46, 'tickets': 47, 'over': 48, 'otaru': 49, 'husband': 50, 'miss': 51, 'intruded': 52, 'fireworks': 53, 'email': 54, 'throat': 55, 'half': 56, 'pictures': 57, "mozart's": 58, 'color': 59, 'given': 60, 'cat': 61, 'today': 62, 'classroom': 63, 'delicious': 64, 'oil': 65, 'cake': 66, 'slowly': 67, 'boys': 68, 'tr

In [3]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, token in enumerate(input_text):
        encoder_input_data[i, t] = input_token_index[token]
    for t, token in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = target_token_index[token]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[token]] = 1.

print("The shape is (batch_size, time_steps, input_dim)")
print("so it will be (number of x, length of each x, dimension of x)")
print("\nencoder input shape:")
print(encoder_input_data.shape)
print("\ndecoder input shape:")
print(decoder_input_data.shape)
print("\ndecoder target shape:")
print(decoder_target_data.shape)
            
print("\nencoder input data (1-hot):")
print(encoder_input_data[-2:])
print("\ndecoder input data (1-hot):")
print(decoder_input_data[-2:])
print("\ndecoder target data (1-hot):")
print(decoder_target_data[-2:])

The shape is (batch_size, time_steps, input_dim)
so it will be (number of x, length of each x, dimension of x)

encoder input shape:
(2000, 32)

decoder input shape:
(2000, 32)

decoder target shape:
(2000, 32, 3211)

encoder input data (1-hot):
[[1842.  363. 1951. 1025.  821.  719.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.]
 [1988. 1410.  766.  677.  772. 1380.  668.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.]]

decoder input data (1-hot):
[[ 734. 2779. 2341.    0. 2665.   30. 1502.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.]
 [ 734. 3034. 1917. 3034. 2545.  105.   30. 1502.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    

In [6]:
class Encoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(self.latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform',
                        dropout=0.2,
                        recurrent_dropout=0.2)
        self.embedding = Embedding(vocab_size, latent_dim, mask_zero=True)

    def call(self, x):
        print(f"x: {x.shape}")
        x = self.embedding(x)
        output, state = self.lstm(x)
        return output, state, x._keras_mask


encoder_inputs = Input(shape=(None, ))
encoder = Encoder(num_encoder_tokens, latent_dim, batch_size)
encoder_outputs, state, encoder_embedding_mask = encoder(encoder_inputs)

print(f"encoder_inputs: {encoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"encoder_outputs: {encoder_outputs.shape}")

x: (None, None)
encoder_inputs: (None, None)
state: (None, 128)
encoder_outputs: (None, None, 128)


# If we don't use `W1` and `W2`, we will get a better result

In [11]:
import tensorflow as tf

class BahdanauAttention2(Layer):
    def __init__(self, units):
        super().__init__()
        
        self.W1 = Dense(units, activation='sigmoid')
        self.W2 = Dense(units, activation='sigmoid')
        self.attention = tf.keras.layers.AdditiveAttention(dropout=0.2)
        
    def call(self, query, value, mask, encoder_mask):
        print(f"mask: {mask.shape}")
        print(f"encoder_mask: {encoder_mask.shape}")
        # From Eqn. (4), `W1@ht`.
        # w1_query = self.W1(query)
        # w1_query = Dropout(0.2)(w1_query)

        # From Eqn. (4), `W2@hs`.
        # from memory to key
        # w2_key = self.W2(value)
        # w2_key = Dropout(0.2)(w2_key)
        
        context_vector, attention_weights = self.attention(
            inputs = [query, value],
            mask=[mask, encoder_mask],
            return_attention_scores = True,
        )

        return context_vector, attention_weights

# We can also try not to use hidden in Decoder (i.e., remove `initial_state=hidden`), all information comes from attention layers

In [13]:
class Decoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform',
                        dropout=0.2,
                        recurrent_dropout=0.2)
        self.embedding = Embedding(vocab_size, latent_dim, mask_zero=True)
        self.fc = Dense(vocab_size, activation='softmax')
        self.attention = BahdanauAttention2(self.latent_dim)

    def call(self, x, hidden, enc_output, encoder_mask):
        print(f"x: {x.shape}")
        print(f"hidden: {hidden.shape}")
        print(f"enc_output: {enc_output.shape}")
        
        x = self.embedding(x)
        rnn_output, rnn_state = self.lstm(x)
        print(f"rnn_output: {rnn_output.shape}")
        
        context_vector, attention_weights = self.attention(rnn_output, enc_output, x._keras_mask, encoder_mask)
        
        # context_and_rnn_output = tf.concat([context_vector, rnn_output], axis=-1)
        
        x = tf.concat([context_vector, rnn_output], axis=-1)
        
        #NOTE: does reshape needed?
        # output = tf.reshape(output, (-1, output.shape[2]))
        y = self.fc(x)

        return y, rnn_state


decoder_inputs = Input(shape=(None, ))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder = Decoder(num_decoder_tokens, latent_dim, batch_size)
decoder_outputs, _ = decoder(decoder_inputs, hidden=state, enc_output=encoder_outputs, encoder_mask=encoder_embedding_mask)


print(f"decoder_inputs: {decoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"decoder_outputs: {decoder_outputs.shape}")

x: (None, None)
hidden: (None, 128)
enc_output: (None, None, 128)
rnn_output: (None, None, 128)
mask: (None, None)
encoder_mask: (None, None)
decoder_inputs: (None, None)
state: (None, 128)
decoder_outputs: (None, None, 3211)


In [14]:
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
    # NOTE: this learning rate is for at least 16,000 sample on GPU server...
    # initial_learning_rate=0.005,
    # decay_steps=3200,
    initial_learning_rate=0.001,
    decay_steps=200,
    decay_rate=0.9
)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_scheduler, clipnorm=1.0), loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_1 (Encoder)             ((None, None, 128),  380416      input_2[0][0]                    
__________________________________________________________________________________________________
decoder_3 (Decoder)             ((None, None, 3211), 1335435     input_6[0][0]                    
                                                                 encoder_1[0][0]              

In [15]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
        batch_size=batch_size,
        epochs=3,
        validation_split=0.2)

2025-06-23 10:47:18.308829: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 657612800 exceeds 10% of free system memory.
2025-06-23 10:47:22.544979: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2025-06-23 10:47:22.651525: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2198200000 Hz


Epoch 1/3
x: (16, 32)
x: (16, 32)
hidden: (16, 128)
enc_output: (16, 32, 128)
rnn_output: (16, 32, 128)
mask: (16, 32)
encoder_mask: (16, 32)
x: (16, 32)
x: (16, 32)
hidden: (16, 128)
enc_output: (16, 32, 128)
rnn_output: (16, 32, 128)
mask: (16, 32)
encoder_mask: (16, 32)
x: (16, 32)
hidden: (16, 128)
enc_output: (16, 32, 128)
rnn_output: (16, 32, 128)
mask: (16, 32)
encoder_mask: (16, 32)
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f920eca4c10>

In [21]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [22]:
encoder_model = Model(encoder_inputs, [encoder_outputs, state, encoder_embedding_mask])
encoder_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
encoder_1 (Encoder)          ((None, None, 128), (None 380416    
Total params: 380,416
Trainable params: 380,416
Non-trainable params: 0
_________________________________________________________________


In [23]:
decoder_state_input = Input(shape=(latent_dim,))
the_encoder_output = Input(shape=(None, latent_dim,))
the_encoder_mask = Input(shape=(None,), dtype=tf.bool)

decoder_outputs, decoder_state = decoder(
    decoder_inputs, hidden=decoder_state_input, enc_output=the_encoder_output, encoder_mask=the_encoder_mask)

decoder_model = Model(
    [decoder_inputs] + [decoder_state_input] + [the_encoder_output] + [the_encoder_mask],
    [decoder_outputs] + [decoder_state])

decoder_model.summary()

x: (None, None)
hidden: (None, 128)
enc_output: (None, None, 128)
rnn_output: (None, None, 128)
mask: (None, None)
encoder_mask: (None, None)
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, None, 128)]  0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, 128)]    

In [24]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    encoder_output_value, states_value, mask_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['\t']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        print(f"target_seq: {target_seq}")
        
        output_tokens, the_decoder_state = decoder_model.predict(
            [target_seq] + [states_value] + [encoder_output_value] + [mask_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        #NOTE: maybe another way to do it???? maybe not
        # target_seq = np.append(target_seq, [[sampled_token_index]], axis=1)

        # Update states
        states_value = the_decoder_state

    return decoded_sentence


for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

x: (None, 32)
target_seq: [[734.]]
x: (None, 1)
hidden: (None, 128)
enc_output: (None, 32, 128)
rnn_output: (None, 1, 128)
mask: (None, 1)
encoder_mask: (None, 32)
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
-
Input sentence: ['keep', 'off', 'the', 'grass']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我
target_seq: [[734.]]
target_se

target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
target_seq: [[1763.]]
-
Input sentence: ['where', 'is', 'the', 'nearest', 'police', 'station']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我我
