# There are some tips to learn
## It is actually harder to learn compare to vanilla seq2seq
* Add dropout
  * because we are adding some more layers
* Remove `initial_state=hidden`, get all information purely from attention layers
* Add `clipnorm`
* Change `learning_rate` is the most important thing

In [1]:
! ls ../../data/

CrowdFlowerAnnotations.txt  Flickr8k.token.txt	       machine_translation
ExpertAnnotations.txt	    Flickr_8k.devImages.txt    readme.txt
Flicker8k_smaller	    Flickr_8k.testImages.txt
Flickr8k.lemma.token.txt    Flickr_8k.trainImages.txt


In [2]:
! pip3 install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314477 sha256=ed27d19a84f32bfb6048db6e791987820e2c52697ae582f47ab1202f2dc61d03
  Stored in directory: /root/.cache/pip/wheels/24/aa/17/5bc7c72e9a37990a9620cc3aad0acad1564dcff6dbc2359de3
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Layer, Dropout
import numpy as np
import random
import jieba

batch_size = 16  
epochs = 15  
latent_dim = 128  # Latent dimensionality of the encoding space.
num_samples = 2000  # Number of samples to train on.
data_path = '../../data/machine_translation/cmn.txt'

input_texts = []
target_texts = []
input_tokens = set()
target_tokens = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# for line in lines[: min(num_samples, len(lines) - 1)]:
for line in random.sample(lines, num_samples):
    input_text, target_text = line.split('\t')
    
    # We use "\t" as the "start sequence" and "\n" as "end sequence"
    target_text = '\t' + target_text + '\n'
    
    tmp = []
    for token in input_text.split(" "):
        token = token.replace(",", "").replace(".", "").replace("?", "").lower()
        tmp.append(token)
        if token not in input_tokens:
            input_tokens.add(token)
    input_texts.append(tmp)
    
    tmp = []
    for token in jieba.cut(target_text, cut_all=False):
        tmp.append(token)
        if token not in target_tokens:
            target_tokens.add(token)
    target_texts.append(tmp)

input_tokens = list(input_tokens)
target_tokens = list(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

print("\ninput data set:")
print(input_texts[:10])
print("\ntarget data set:")
print(target_texts[:10])

2025-06-25 13:54:48.276819: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-25 13:54:48.276927: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.176 seconds.
Prefix dict has been built successfully.


Number of samples: 2000
Number of unique input tokens: 2165
Number of unique output tokens: 3198
Max sequence length for inputs: 19
Max sequence length for outputs: 21

input data set:
[['i', 'sometimes', 'still', 'think', 'about', 'her'], ['if', 'i', 'had', 'one', 'million', 'yen', 'now', 'i', 'would', 'buy', 'a', 'car'], ['the', 'time', 'has', 'come', 'when', 'i', 'must', 'tell', 'you', 'the', 'truth'], ["it's", 'already', 'nine', "o'clock"], ['come', 'along', 'with', 'us'], ['there', 'was', 'a', 'castle', 'here', 'many', 'years', 'ago'], ['his', 'words', 'gave', 'me', 'hope'], ['i', 'wish', 'i', 'were', 'a', 'good', 'singer'], ['"he\'d', 'like', 'to', 'have', 'a', 'coffee', 'after', 'work"', '"i', 'would', 'too"'], ["you're", 'welcome']]

target data set:
[['\t', '有', '時候', '，', '我', '還是', '會', '想起', '她', '。', '\n'], ['\t', '如果', '我', '现在', '有', '100', '万日元', '，', '我会', '买', '辆车', '。', '\n'], ['\t', '現在', '是', '我', '必須', '告訴', '你', '真相', '的', '時候', '了', '。', '\n'], ['\t', '已经', '9',

In [2]:
input_token_index = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_token_index = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

print("\ninput index:")
print(input_token_index)

print("\ntarget index:")
print(target_token_index)


input index:

target index:
{'第一': 0, '一碗': 1, '上': 2, '两件事': 3, '學習': 4, '你': 5, '旁邊': 6, '彻夜': 7, '拼': 8, '贴': 9, '外国人': 10, '打开': 11, '進口': 12, '日子': 13, '裡有': 14, '政治': 15, '停住': 16, '認真': 17, '園': 18, '挑战': 19, '跳蚤': 20, '將去': 21, '照片': 22, '其中': 23, '完成': 24, '過去': 25, '舒服': 26, '蕃茄': 27, '裝滿水': 28, '西班牙': 29, '好几间': 30, '到': 31, '份': 32, '見過': 33, '信': 34, '去給': 35, '太': 36, '天才': 37, '寫': 38, '十月': 39, '寄': 40, '不怕死': 41, '酒吧': 42, '结束': 43, '事實': 44, '显著': 45, '猫': 46, '前搖動': 47, '那本書': 48, '相處': 49, '你講': 50, 'Facebook': 51, '彎': 52, '他會選': 53, '下山': 54, '旁边': 55, '電腦': 56, '至少': 57, '開心': 58, '沙發': 59, '出去': 60, '时间': 61, '放棄': 62, '使': 63, '讓': 64, '设法': 65, '一样': 66, '上衣': 67, '绝不会': 68, '臉': 69, '难以置信': 70, '星期': 71, '将会': 72, '還牙': 73, '那些': 74, '供认': 75, '湯姆': 76, '來': 77, '有點': 78, '印度': 79, '載': 80, '不起': 81, '歡辣': 82, '不耐烦': 83, '总是': 84, '我六點': 85, '了': 86, '事情': 87, '翰': 88, '旅館': 89, '待': 90, 'NTT': 91, '过去': 92, '打來': 93, '著': 94, '成员': 95, '此案': 96, '歌手': 97, '

In [3]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, token in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[token]] = 1.
    for t, token in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[token]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[token]] = 1.

print("The shape is (batch_size, time_steps, input_dim)")
print("so it will be (number of x, length of each x, dimension of x)")
print("\nencoder input shape:")
print(encoder_input_data.shape)
print("\ndecoder input shape:")
print(decoder_input_data.shape)
print("\ndecoder target shape:")
print(decoder_target_data.shape)
            
print("\nencoder input data (1-hot):")
print(encoder_input_data[-2:])
print("\ndecoder input data (1-hot):")
print(decoder_input_data[-2:])
print("\ndecoder target data (1-hot):")
print(decoder_target_data[-2:])

The shape is (batch_size, time_steps, input_dim)
so it will be (number of x, length of each x, dimension of x)

encoder input shape:
(2000, 19, 2165)

decoder input shape:
(2000, 21, 3198)

decoder target shape:
(2000, 21, 3198)

encoder input data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder input data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder target data (1-hot):
[[[0. 0. 0. ... 0. 0. 0.]
 

In [4]:
class Encoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(self.latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform',
                        dropout=0.2,
                        recurrent_dropout=0.2)

    def call(self, x):
        output, state = self.lstm(x)
        return output, state


encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = Encoder(num_encoder_tokens, latent_dim, batch_size)
encoder_outputs, state = encoder(encoder_inputs)

print(f"encoder_inputs: {encoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"encoder_outputs: {encoder_outputs.shape}")

2025-06-25 13:57:29.628823: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2025-06-25 13:57:29.670932: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2025-06-25 13:57:29.676839: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2025-06-25 13:57:29.678506: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (2780c4aad032): /proc/driver/nvidia/version does not exist
2025-06-25 13:57:29.687902: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the ap

encoder_inputs: (None, None, 2165)
state: (None, 128)
encoder_outputs: (None, None, 128)


# We can also try not to use hidden in Decoder (i.e., remove `initial_state=hidden`), all information comes from attention layers

In [5]:
import tensorflow as tf

class Decoder(Model):
    def __init__(self, vocab_size, latent_dim, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.latent_dim = latent_dim
        self.lstm = GRU(latent_dim,
                        return_sequences=True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform',
                        dropout=0.2,
                        recurrent_dropout=0.2)
        self.fc = Dense(vocab_size, activation='softmax')
        self.attention = tf.keras.layers.Attention(name='attention_layer')

    def call(self, x, hidden, enc_output):
        print(f"x: {x.shape}")
        print(f"hidden: {hidden.shape}")
        print(f"enc_output: {enc_output.shape}")
        
        rnn_output, rnn_state = self.lstm(x)
        print(f"rnn_output: {rnn_output.shape}")
        
        context_vector = self.attention([rnn_output, enc_output])
        decoder_combined_context = tf.keras.layers.Concatenate(axis=-1)([rnn_output, context_vector])
        y = self.fc(decoder_combined_context)

        return y, rnn_state


decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder = Decoder(num_decoder_tokens, latent_dim, batch_size)
decoder_outputs, _ = decoder(decoder_inputs, hidden=state, enc_output=encoder_outputs)


print(f"decoder_inputs: {decoder_inputs.shape}")
print(f"state: {state.shape}")
print(f"decoder_outputs: {decoder_outputs.shape}")

x: (None, None, 3198)
hidden: (None, 128)
enc_output: (None, None, 128)
rnn_output: (None, None, 128)
decoder_inputs: (None, None, 3198)
state: (None, 128)
decoder_outputs: (None, None, 3198)


In [6]:
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
    # NOTE: this learning rate is for at least 16,000 sample on GPU server...
    # initial_learning_rate=0.005,
    # decay_steps=3200,
    initial_learning_rate=0.001,
    decay_steps=200,
    decay_rate=0.9
)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_scheduler, clipnorm=1.0), loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 2165)] 0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 3198)] 0                                            
__________________________________________________________________________________________________
encoder (Encoder)               ((None, None, 128),  881280      input_1[0][0]                    
__________________________________________________________________________________________________
decoder (Decoder)               ((None, None, 3198), 2099838     input_2[0][0]                    
                                                                 encoder[0][0]                

In [7]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
        batch_size=batch_size,
        epochs=3,
        validation_split=0.2)

2025-06-25 13:59:08.120007: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2025-06-25 13:59:08.121803: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2196495000 Hz


Epoch 1/3
x: (16, 21, 3198)
hidden: (16, 128)
enc_output: (16, 19, 128)
rnn_output: (16, 21, 128)
x: (16, 21, 3198)
hidden: (16, 128)
enc_output: (16, 19, 128)
rnn_output: (16, 21, 128)
hidden: (16, 128)
enc_output: (16, 19, 128)
rnn_output: (16, 21, 128)
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f06d70fa550>

In [8]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [9]:
encoder_model = Model(encoder_inputs, [encoder_outputs, state])
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, 2165)]      0         
_________________________________________________________________
encoder (Encoder)            ((None, None, 128), (None 881280    
Total params: 881,280
Trainable params: 881,280
Non-trainable params: 0
_________________________________________________________________


In [10]:
decoder_state_input = Input(shape=(latent_dim,))
the_encoder_output = Input(shape=(None, latent_dim,))

decoder_outputs, decoder_state = decoder(
    decoder_inputs, hidden=decoder_state_input, enc_output=the_encoder_output)

decoder_model = Model(
    [decoder_inputs] + [decoder_state_input] + [the_encoder_output],
    [decoder_outputs] + [decoder_state])

decoder_model.summary()

x: (None, None, 3198)
hidden: (None, 128)
enc_output: (None, None, 128)
rnn_output: (None, None, 128)
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None, 3198)] 0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None, 128)]  0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
decoder (Decoder)               ((None, None, 3198), 2099838     input_2[0][0]           

In [11]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    encoder_output_value, states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, the_decoder_state = decoder_model.predict(
            [target_seq] + [states_value] + [encoder_output_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = the_decoder_state

    return decoded_sentence


for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

x: (None, 1, 3198)
hidden: (None, 128)
enc_output: (None, 19, 128)
rnn_output: (None, 1, 128)
-
Input sentence: ['i', 'sometimes', 'still', 'think', 'about', 'her']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['if', 'i', 'had', 'one', 'million', 'yen', 'now', 'i', 'would', 'buy', 'a', 'car']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['the', 'time', 'has', 'come', 'when', 'i', 'must', 'tell', 'you', 'the', 'truth']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ["it's", 'already', 'nine', "o'clock"]
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['come', 'along', 'with', 'us']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['there', 'was', 'a', 'castle', 'here', 'many', 'years', 'ago']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['his', 'words', 'gave', 'me', 'hope']
Decoded sentence: 我我我我我我我我我我我我我我我我我我我我我我
-
Input sentence: ['i', 'wish', 'i', 'were', 'a', 'good', 'singer']
Decoded sentence: 我我我我我我我我我我我我我