In [25]:
from keras.models import Model
from keras.layers import Input, CuDNNLSTM, Embedding, Dense
from keras.utils import plot_model
import numpy as np
import sentencepiece as spm

batch_size = 64
epochs = 100
hidden_dims = 256
num_samples = 45093

data_file = "jpn.txt"
enc_input_tokens = []
dec_input_tokens = []
dec_target_tokens = []
start_token = "<"
end_token = ">"

with open(data_file, "r", encoding="utf-8") as f:
    lines_list = f.read().split("\n")

tokenizer = spm.SentencePieceProcessor()
tokenizer.Load("spm.model")

for line in lines_list:
    #for the last black data, we need to skip
    if line == "":
        break
    source_text, target_text = line.split("\t")
    target_text = start_token + target_text
    tokenized_source_text = tokenizer.EncodeAsPieces(source_text)
    tokenized_target_text = tokenizer.EncodeAsPieces(target_text)
    
    int_tokenized_source = []
    int_tokenized_input_target = []
    int_tokenized_output_target = []
    for token in tokenized_source_text:
        int_tokenized_source.append(tokenizer.piece_to_id(token))
    for i, token in enumerate(tokenized_target_text):
        if i == 0:
            continue
        int_tokenized_input_target.append(tokenizer.piece_to_id(token))
        
        if i == 1:
            continue
        int_tokenized_output_target.append(tokenizer.piece_to_id(token))
        
    int_tokenized_output_target.append(tokenizer.piece_to_id(end_token))
    
    if len(int_tokenized_output_target) != len(int_tokenized_input_target):
        print("Error")
        
    enc_input_tokens.append(int_tokenized_source)
    dec_input_tokens.append(int_tokenized_input_target)
    dec_target_tokens.append(int_tokenized_output_target)

len(enc_input_tokens), len(dec_input_tokens), len(dec_target_tokens)

(45093, 45093, 45093)

In [26]:
max_enc_seq = max([len(i) for i in enc_input_tokens])
max_dec_seq = max([len(i) for i in dec_input_tokens])

max_enc_seq, max_dec_seq

(128, 62)

In [27]:
from tqdm import tqdm

def pad_or_truncate_inputs(data, max_len):
    new_data = []
    pad = 0
        
    for sample in tqdm(data):
        if len(sample) >= max_len:
            tmp = sample[:max_len]
        else:
            tmp = sample
            num_of_pads_needed = max_len - len(sample)
            for _ in range(num_of_pads_needed):
                tmp.append(pad)
                
        new_data.append(tmp)
        
    return new_data

In [28]:
enc_input_tokens = pad_or_truncate_inputs(enc_input_tokens, max_enc_seq)
dec_input_tokens = pad_or_truncate_inputs(dec_input_tokens, max_dec_seq)
dec_target_tokens = pad_or_truncate_inputs(dec_target_tokens, max_dec_seq)

enc_input_tokens = np.array(enc_input_tokens)
dec_input_tokens = np.array(dec_input_tokens)
dec_target_tokens = np.array(dec_target_tokens)

len(enc_input_tokens.shape), len(dec_input_tokens.shape), len(dec_target_tokens.shape)

100%|██████████| 45093/45093 [00:00<00:00, 45565.37it/s]
100%|██████████| 45093/45093 [00:00<00:00, 81987.81it/s]
100%|██████████| 45093/45093 [00:00<00:00, 78140.43it/s]


(2, 2, 2)

In [23]:
enc_input_tokens

array([list([1168, 192, 106]), list([1168, 192, 106]),
       list([1354, 241, 106]), ...,
       list([6, 1945, 6, 436, 1252, 6, 368, 2764, 6, 566, 1184, 1078, 1264, 6, 1041, 6, 413, 192, 1264, 6, 2055, 1148, 2163, 4378, 751, 1146, 434, 282, 508, 597, 6, 22, 106, 848, 5602, 657, 413, 4418, 6, 544, 531, 2291, 4444, 2253, 368, 2565, 597, 1146, 434, 282, 508, 597, 135, 6, 334, 918, 5672, 6, 706, 216, 1256, 106, 6, 1945, 6, 3997, 3222, 268, 918, 6, 436, 1252, 6, 2055, 2291, 4444, 2253, 368, 2565, 597, 1146, 434, 282, 508, 597, 6, 597, 160, 192, 6, 918, 3679, 1196, 6, 544, 995, 282, 6, 1195, 1041, 6, 1371, 3777, 106]),
       list([1027, 6, 945, 1918, 497, 6, 706, 3903, 6, 1041, 6, 216, 6, 368, 1055, 4491, 6, 160, 334, 116, 3552, 434, 6, 1252, 459, 4158, 286, 6, 394, 368, 2764, 160, 6, 5033, 286, 6, 1195, 1264, 160, 6, 3669, 2362, 268, 6, 522, 1041, 6, 531, 6, 522, 434, 6, 486, 5538, 459, 5503, 6, 1195, 1055, 6, 216, 6, 368, 521, 82, 368, 1055, 4491, 6, 160, 334, 116, 3552, 434, 6, 706, 39

In [15]:
dec_input_tokens[100:110]

[[2003, 1721, 9, 2392, 5],
 [2003, 1136, 3247, 4792, 232, 675, 2516, 5],
 [2003, 6043, 97, 4, 2322, 213, 947],
 [2003, 414, 105, 3371, 947],
 [2003, 973, 14, 324, 31, 21, 40, 4, 2607],
 [2003, 1600, 50, 1136, 40, 4, 2607],
 [2003, 44, 80, 10, 1774, 89, 40, 3654, 5],
 [2003, 44, 80, 10, 1774, 798, 5],
 [2003, 44, 80, 10, 1774, 798, 1151, 97, 176, 5],
 [2003, 156, 1151, 41, 3371, 947]]

In [16]:
dec_target_tokens[100:110]

[[1721, 9, 2392, 5, 1954],
 [1136, 3247, 4792, 232, 675, 2516, 5, 1954],
 [6043, 97, 4, 2322, 213, 947, 1954],
 [414, 105, 3371, 947, 1954],
 [973, 14, 324, 31, 21, 40, 4, 2607, 1954],
 [1600, 50, 1136, 40, 4, 2607, 1954],
 [44, 80, 10, 1774, 89, 40, 3654, 5, 1954],
 [44, 80, 10, 1774, 798, 5, 1954],
 [44, 80, 10, 1774, 798, 1151, 97, 176, 5, 1954],
 [156, 1151, 41, 3371, 947, 1954]]

In [18]:
vocab_size = 8000
hidden_dims = 256
emb_dims = 300

##Network Architecture
enc_inputs = Input(shape=(None,))
enc_emb = Embedding(vocab_size, emb_dims)(enc_inputs)
enc = CuDNNLSTM(hidden_dims, return_state=True)
_, state_h, state_c = enc(enc_emb)
enc_states = [state_h, state_c]

dec_inputs = Input(shape=(None,))
dec_emb = Embedding(vocab_size, emb_dims)(dec_inputs)
dec = CuDNNLSTM(hidden_dims, return_sequences=True, return_state=True)
dec_outputs, _, _ = dec(dec_emb, initial_state=enc_states)
dec_dense = Dense(vocab_size, activation="softmax")
dec_outputs = dec_dense(dec_outputs)

model = Model([enc_inputs, dec_inputs], dec_outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    2400000     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    2400000     input_6[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm

In [19]:
plot_model(model, to_file="seq2seq.png", show_shapes=True)

In [20]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.fit([enc_input_tokens, dec_input_tokens],
          dec_target_tokens,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2
         )

ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 45093 arrays: [array([[ 167],
       [ 420],
       [   5],
       [1954]]), array([[2620],
       [  40],
       [3654],
       [   5],
       [1954]]), array([[ 331],
       [ 176],
       [  10],
       [ 404],
...

In [28]:
output = ""
for i in target_tokens[2]:
    output += tokenizer.IdToPiece(i)
    
output

'<こんにちは。>'

In [19]:
output = ""
for i in input_tokens[0]:
    output += tokenizer.IdToPiece(i)
    
output

'▁Go.'