In [15]:
import json
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import pandas as pd

def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False

def init():
    lists = []
    # reading the json file
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if is_chinese(data['chinese']) == True:
                if len(data['chinese'])<6:
                    lists.append(data)
                if (len(lists)+1)%100 == 0:
                    break

    df = pd.DataFrame(lists)
    df.to_csv('datafile.csv', encoding='utf-8', index=False)
    

init()
df = pd.read_csv('datafile.csv')
df['chinese'] = df['chinese'].apply(lambda x: '@' + x + '。')
en_data = df.english.values.tolist()
ch_data = df.chinese.values.tolist()
en_vocab = set(''.join(en_data))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}
ch_vocab = set(''.join(ch_data))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

In [16]:
def num_data(en2id,ch2id,en_data):
    en_num_data = [[en2id[en] for en in line ] for line in en_data]
    ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
    de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]
 
    max_encoder_seq_length = max([len(txt) for txt in en_num_data])
    max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
    print('max encoder length:', max_encoder_seq_length)
    print('max decoder length:', max_decoder_seq_length)

    encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
    decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
    decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

    for i in range(len(ch_num_data)):
        for t, j in enumerate(en_num_data[i]):
            encoder_input_data[i, t, j] = 1.
        for t, j in enumerate(ch_num_data[i]):
            decoder_input_data[i, t, j] = 1.
        for t, j in enumerate(de_num_data[i]):
            decoder_target_data[i, t, j] = 1.

    print('index data:\n', en_num_data[1])
    print('one hot data:\n', encoder_input_data[1])
    return encoder_input_data,decoder_input_data,decoder_target_data

nd = num_data(en2id,ch2id,en_data)

max encoder length: 75
max decoder length: 7
index data:
 [5, 41, 57, 49, 50, 28, 33, 49, 3, 24, 33, 33, 57, 33, 33, 57, 0, 49, 25, 62, 49, 28, 49, 0, 57, 51, 14, 45, 53]
one hot data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256
LEARNING_RATE = 0.01
BATCH_SIZE = 20
EPOCHS = 250

def train():
    encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
    encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
    encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)
    decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))
    lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
    lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
    decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

    decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
    decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
    decoder_outputs = decoder_dense(decoder_h2)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    model.fit([nd[0], nd[1]], nd[2],batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=0.)
    model.save('s2s.h5')# Save model
    encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])# encoder模型和訓練相同
    decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))# 預測模型中的decoder的初始化狀態需要傳入新的狀態
    decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
    decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
    decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))
    decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])# 使用傳入的值來初始化當前模型的輸入狀態
    decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
    decoder_outputs = decoder_dense(decoder_h2)
    decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                        [decoder_outputs, state_h1, state_c1, state_h2, state_c2])
    return encoder_model,decoder_model

model = train()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, None, 67)]   0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, None, 281)]  0                                            
__________________________________________________________________________________________________
lstm_14 (LSTM)                  [(None, None, 256),  331776      input_17[0][0]                   
__________________________________________________________________________________________________
lstm_16 (LSTM)                  [(None, None, 256),  550912      input_18[0][0]                   
                                                                 lstm_14[0][1]              

Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epo

Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 

Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


In [18]:
def result():    
    for k in range(0,99):
        test_data = nd[0][k:k+1]
        h1, c1, h2, c2 = model[0].predict(test_data)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, ch2id['@']] = 1
        outputs = []
        while True:
            output_tokens, h1, c1, h2, c2 = model[1].predict([target_seq, h1, c1, h2, c2])
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            outputs.append(sampled_token_index)
            target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
            target_seq[0, 0, sampled_token_index] = 1
            if sampled_token_index == ch2id['。'] or len(outputs) > 10:
                break
        
        print(en_data[k])
        print(''.join([id2ch[i] for i in outputs]))

result()

Side-to-Side Movements.
我保证。
She was possessed by a devil.
我保证。
The majority was wrong last time.
我保证。
Great talents flower late.
我保证。
Erhai Lake （in Yunnan Province）
他不专上。
It strikes one as very strange.
我保证。
Did I divide right?
他乱摇铃。
I promise".
我保证。
All was quiet and not a soul was to be seen.
我保证。
Folder wire glass;
我保证。
Steam bath;
我保证。
Package sealing.
我保证。
Mud on salt.
他乱摇铃。
What street?
我保证。
You must be very quiet. Hold tight to me.
他乱摇铃。
Frozen ocean shrimp;
我保证。
It was he.
我保证。
Target Line;
我保证。
Chemical potential.
他乱摇铃。
He ate no soup.
他乱摇铃。
What is to be done?
我保证。
Front wheel drive.
我保证。
All streams flow into the Huanghe River.
我保证。
A:It's on the seccond floor.
我保证。
All the boats and carts started off at the same time.
我保证。
The Univ.
我保证。
The printing plant.
我保证。
It looks like rain.
我保证。
Straight tool holder .
我保证。
I'm afraid of dying a violent death.
我怕暴死。
How do they channel it?
他乱摇铃。
You love me, then?
他乱摇铃。
The fragrance of flowers assails one's nose.
我保证。
Octal Digit?