In [1]:
import json

file = open('translation2019zh_train.json' , 'r',encoding='utf-8') 

en_data=[]
ch_data=[]

for line in file.readlines():
    tmp=json.loads(line)
    en_data.append(tmp['english'])
    ch_data.append(tmp['chinese'])

#print(en[5161433])
#print(ch[5161433])
#print(en[:10])
#print(ch[:10])

file.close()

In [2]:
# 分別生成中英文字典
en_vocab = set(''.join(en_data))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

# 分別生成中英文字典
ch_vocab = set(''.join(ch_data))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

#print(en2id)

In [3]:
en_num_data = [[en2id[en] for en in line ] for line in en_data]
ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]

print('char:', en_data[1])
print('index:', en_num_data[1])

char: I didn't own a Thesaurus until four years ago and I use a small Webster's dictionary that I'd bought at K-Mart for 89 cents.
index: [98, 38, 154, 25, 154, 108, 104, 152, 38, 110, 59, 108, 38, 2, 38, 36, 48, 57, 39, 2, 139, 144, 139, 39, 38, 139, 108, 152, 25, 76, 38, 37, 110, 139, 144, 38, 5, 57, 2, 144, 39, 38, 2, 24, 110, 38, 2, 108, 154, 38, 98, 38, 139, 39, 57, 38, 2, 38, 39, 43, 2, 76, 76, 38, 83, 57, 75, 39, 152, 57, 144, 104, 39, 38, 154, 25, 14, 152, 25, 110, 108, 2, 144, 5, 38, 152, 48, 2, 152, 38, 98, 104, 154, 38, 75, 110, 139, 24, 48, 152, 38, 2, 152, 38, 40, 116, 10, 2, 144, 152, 38, 37, 110, 144, 38, 85, 7, 38, 14, 57, 108, 152, 39, 78]


In [4]:
import numpy as np

# 獲取輸入輸出端的最大長度
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

# 將數據進行onehot處理
encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float16')
decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float16')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float16')

for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_data[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_data[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.

print('index data:\n', en_num_data[1])
print('one hot data:\n', encoder_input_data[1])

max encoder length: 256
max decoder length: 174
index data:
 [98, 38, 154, 25, 154, 108, 104, 152, 38, 110, 59, 108, 38, 2, 38, 36, 48, 57, 39, 2, 139, 144, 139, 39, 38, 139, 108, 152, 25, 76, 38, 37, 110, 139, 144, 38, 5, 57, 2, 144, 39, 38, 2, 24, 110, 38, 2, 108, 154, 38, 98, 38, 139, 39, 57, 38, 2, 38, 39, 43, 2, 76, 76, 38, 83, 57, 75, 39, 152, 57, 144, 104, 39, 38, 154, 25, 14, 152, 25, 110, 108, 2, 144, 5, 38, 152, 48, 2, 152, 38, 98, 104, 154, 38, 75, 110, 139, 24, 48, 152, 38, 2, 152, 38, 40, 116, 10, 2, 144, 152, 38, 37, 110, 144, 38, 85, 7, 38, 14, 57, 108, 152, 39, 78]
one hot data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.003
BATCH_SIZE = 200
EPOCHS = 50

In [7]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.optimizers import Adam
import numpy as np

# ==============encoder=============
encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
#emb_inp = Embedding(output_dim=HIDDEN_SIZE, input_dim=EN_VOCAB_SIZE)(encoder_inputs)
encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)

In [8]:
decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))

#emb_target = Embedding(output_dim=HIDDEN_SIZE, input_dim=CH_VOCAB_SIZE, mask_zero=True)(decoder_inputs)
lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
decoder_outputs = decoder_dense(decoder_h2)

In [9]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 159)    0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, None, 2919)   0                                            
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 256),  425984      input_2[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, None, 256),  3252224     input_3[0][0]                    
                                                                 lstm_3[0][1]               

<keras.callbacks.callbacks.History at 0x21828f9eb48>

In [10]:
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])

# 預測模型中的decoder的初始化狀態需要傳入新的狀態
decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))

# 使用傳入的值來初始化當前模型的輸入狀態
decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
decoder_outputs = decoder_dense(decoder_h2)

decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                      [decoder_outputs, state_h1, state_c1, state_h2, state_c2])

In [18]:
for k in range(0,100):
    test_data = encoder_input_data[k:k+1]
    h1, c1, h2, c2 = encoder_model.predict(test_data)
    target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
    #target_seq[0, 0, ch2id['\t']] = 1
    outputs = []
    while True:
        output_tokens, h1, c1, h2, c2 = decoder_model.predict([target_seq, h1, c1, h2, c2])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1
        #if sampled_token_index == ch2id['\n'] or len(outputs) > 20: break
        if len(outputs) > 20: break
    
    print(en_data[k])
    print(''.join([id2ch[i] for i in outputs]))

Slowly and not without struggle, America began to listen.
文性物的的电化的电理机、特化、工度、工术、
I didn't own a Thesaurus until four years ago and I use a small Webster's dictionary that I'd bought at K-Mart for 89 cents.
文性物的的电化的电理机、特化、工度、工术、
portlet, you must write three short deployment descriptors: web.xml, portlet.xml, and geronimo-web.xml. (Some of these may have been generated by your IDE.)
文性物的的电化的电理机、特化、工度、工术、
Dithering is a technique that blends your colors together, making them look smoother, or just creating interesting textures.
文性物的的电化的电理机、特化、工度、工术、
This paper discusses the petrologic characteristics of the coal-bearing strata under the geologic structural background of the Tertiary coal basin in Hunchun.
文性物的的电化的电理机、特化、工度、工术、
Women over 55 are pickier about their partners than at any other time in their lives.
文性物的的电化的电理机、特化、工度、工术、
Ruben: So, to heal (with capital letters) you need to have no predilections.
文性物的的电化的电理机、特化、工度、工术、
The second encounter relates to my grandfather's treasure b

Over the years, Col Gaddafi had fallen out with both his neighbours and the West, although he had bankrolled many African leaders.
文性物的的电化的电理机、特化、工度、工术、
Hope that help is what causes, and the other want to recommend a 1.6 the following handwriting input method. thanks.
文性物的的电化的电理机、特化、工度、工术、
Why wait around hoping to be picked for the next season of The Bachelor when the land of virtual romance awaits?
文性物的的电化的电理机、特化、工度、工术、
Department stores:Projected to lose 10.2 percent of the 1.56 million jobs they had in 2008.
文性物的的电化的电理机、特化、工度、工术、
The measured value of particle, sheet resistance and film thickness have been respectively recorded in a table, please see table2.
文性物的的电化的电理机、特化、工度、工术、
Last week a male student in Zhejiang streaked to protest his school's rule that all power be shut off by 11:30 pm.
文性物的的电化的电理机、特化、工度、工术、
I am almost all white, although my fur may turn yellow in summer.
文性物的的电化的电理机、特化、工度、工术、
He outperformed everyone on the test last week.
文性物的的电化的电理机、特化、工度、工术、
Xiao li sai