In [1]:
! ls ../../data/

CrowdFlowerAnnotations.txt  Flickr8k.token.txt	       machine_translation
ExpertAnnotations.txt	    Flickr_8k.devImages.txt    readme.txt
Flicker8k_smaller	    Flickr_8k.testImages.txt
Flickr8k.lemma.token.txt    Flickr_8k.trainImages.txt


In [3]:
! pip3 install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314477 sha256=cbaf4a58ba852d8149ab9fddb79aab77cca091f24d679cc61669769b7f193d43
  Stored in directory: /root/.cache/pip/wheels/24/aa/17/5bc7c72e9a37990a9620cc3aad0acad1564dcff6dbc2359de3
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [15]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import random
import jieba

batch_size = 16  
epochs = 35  
latent_dim = 128  # Latent dimensionality of the encoding space.
num_samples = 2000  # Number of samples to train on.
data_path = '../../data/machine_translation/cmn.txt'

input_texts = []
target_texts = []
input_tokens = set()
target_tokens = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# for line in lines[: min(num_samples, len(lines) - 1)]:
for line in random.sample(lines, num_samples):
    input_text, target_text = line.split('\t')
    
    # We use "\t" as the "start sequence" and "\n" as "end sequence"
    target_text = '\t' + target_text + '\n'
    
    tmp = []
    for token in input_text.split(" "):
        token = token.replace(",", "").replace(".", "").replace("?", "").lower()
        tmp.append(token)
        if token not in input_tokens:
            input_tokens.add(token)
    input_texts.append(tmp)
    
    tmp = []
    for token in jieba.cut(target_text, cut_all=False):
        tmp.append(token)
        if token not in target_tokens:
            target_tokens.add(token)
    target_texts.append(tmp)

input_tokens = list(input_tokens)
target_tokens = list(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

print("\ninput data set:")
print(input_texts[:10])
print("\ntarget data set:")
print(target_texts[:10])

Number of samples: 2000
Number of unique input tokens: 2165
Number of unique output tokens: 3148
Max sequence length for inputs: 25
Max sequence length for outputs: 26

input data set:
[['when', 'does', 'the', 'game', 'begin'], ['i', 'can', 'understand', 'him', 'perfectly'], ['the', 'school', 'is', 'on', 'top', 'of', 'the', 'hill'], ["i'm", 'sure', "you'll", 'never', 'regret', 'it'], ['who', 'likes', 'beans'], ["we've", 'decided', 'not', 'to', 'fire', 'you'], ['the', 'captain', 'is', 'responsible', 'for', 'the', 'safety', 'of', 'passengers'], ["it's", 'up', 'to', 'you', 'to', 'decide'], ['they', 'made', 'him', 'do', 'the', 'work', 'again'], ['she', 'lives', 'in', 'a', 'huge', 'house']]

target data set:
[['\t', '游戏', '几点', '开始', '？', '\n'], ['\t', '我', '十分', '能', '理解', '他', '。', '\n'], ['\t', '學校', '在', '山丘', '頂上', '。', '\n'], ['\t', '我', '確定', '你', '永不', '後', '悔', '。', '\n'], ['\t', '谁', '喜欢', '豆子', '？', '\n'], ['\t', '我们', '决定', '不', '开除', '你', '。', '\n'], ['\t', '船长', '要', '为', '乘客'

In [16]:
input_token_index = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_token_index = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

print("\ninput index:")
print(input_token_index)

print("\ntarget index:")
print(target_token_index)


input index:
{'hours': 0, 'liked': 1, 'anywhere': 2, 'learning': 3, 'tall': 4, 'clothes': 5, 'seriously': 6, 'tastes': 7, 'carelessness': 8, 'others': 9, 'ceremony': 10, 'different': 11, 'task': 12, 'sixteenth': 13, 'kyoto': 14, 'another': 15, 'shown': 16, 'independent': 17, "you've": 18, 'sofa': 19, 'important': 20, "friend's": 21, 'second': 22, 'repeatedly': 23, 'serious': 24, 'lonely': 25, 'back"': 26, 'appointment': 27, 'cheek': 28, 'own': 29, 'snakes': 30, 'suggested': 31, 'very': 32, 'for': 33, 'wearing': 34, 'belong': 35, 'lot': 36, 'laid': 37, 'questions': 38, 'damaged': 39, 'thirty': 40, 'sleep': 41, 'dinner': 42, 'word': 43, 'quality': 44, 'fool': 45, 'iran': 46, 'sold': 47, 'dallas': 48, 'shadow': 49, 'ambition': 50, 'marie': 51, 'car': 52, 'reached': 53, 'all!': 54, 'buying': 55, 'currently': 56, 'down': 57, 'success': 58, 'puzzled': 59, 'cutest': 60, 'davis': 61, 'ran': 62, "author's": 63, 'sock': 64, 'snowed': 65, 'paris': 66, 'short': 67, 'reading': 68, 'apartment': 69,

In [20]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, token in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[token]] = 1.
    for t, token in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[token]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[token]] = 1.

print("\nencoder shape:")
print(encoder_input_data.shape)
print("\ndecoder input shape:")
print(decoder_input_data.shape)
print("\ndecoder target shape:")
print(decoder_target_data.shape)
            
print("\nencoder data:")
print(encoder_input_data[-2:])
print("\ndecoder input data:")
print(decoder_input_data[-2:])
print("\ndecoder target data:")
print(decoder_target_data[-2:])


encoder shape:
(2000, 25, 2165)

decoder input shape:
(2000, 26, 3148)

decoder target shape:
(2000, 26, 3148)

encoder data:
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder input data:
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

decoder target data:
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0

In [21]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

print(state_h)
print(state_c)

2023-03-18 06:08:01.399773: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-18 06:08:01.417940: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-18 06:08:01.448592: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (7ecf76a70477): /proc/driver/nvidia/version does not exist
2023-03-18 06:08:01.495043: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


KerasTensor(type_spec=TensorSpec(shape=(None, 128), dtype=tf.float32, name=None), name='lstm/PartitionedCall:2', description="created by layer 'lstm'")
KerasTensor(type_spec=TensorSpec(shape=(None, 128), dtype=tf.float32, name=None), name='lstm/PartitionedCall:3', description="created by layer 'lstm'")


In [22]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

print(decoder_outputs)

KerasTensor(type_spec=TensorSpec(shape=(None, None, 3148), dtype=tf.float32, name=None), name='dense/Softmax:0', description="created by layer 'dense'")


In [23]:
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 2165)  0           []                               
                                ]                                                                 
                                                                                                  
 input_2 (InputLayer)           [(None, None, 3148)  0           []                               
                                ]                                                                 
                                                                                                  
 lstm (LSTM)                    [(None, 128),        1174528     ['input_1[0][0]']                
                                 (None, 128),                                                 

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

2023-03-18 06:08:28.087880: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 346400000 exceeds 10% of free system memory.
2023-03-18 06:08:29.646787: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 523827200 exceeds 10% of free system memory.
2023-03-18 06:08:36.201194: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 523827200 exceeds 10% of free system memory.


Epoch 1/35

2023-03-18 06:09:12.226443: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 86600000 exceeds 10% of free system memory.
2023-03-18 06:09:12.272447: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 130956800 exceeds 10% of free system memory.


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35