In [4]:
from keras.models import Model
from keras.layers import Input, CuDNNLSTM, Embedding, Dense
from keras.utils import plot_model
import numpy as np
import sentencepiece as spm

batch_size = 64
epochs = 100
hidden_dims = 256
num_samples = 45093

data_file = "jpn.txt"
enc_input_tokens = []
dec_input_tokens = []
dec_target_tokens = []
start_token_id = 1
end_token_id = 2

with open(data_file, "r", encoding="utf-8") as f:
    lines_list = f.read().split("\n")

tokenizer = spm.SentencePieceProcessor()
tokenizer.Load("spm.model")

for line in lines_list:
    #for the last black data, we need to skip
    if line == "":
        break
    source_text, target_text = line.split("\t")
    tokenized_source_text = tokenizer.EncodeAsPieces(source_text)
    #test = tokenizer.EncodeAsIds(source_text)
    #print(test)
    tokenized_target_text = tokenizer.EncodeAsPieces(target_text)
    
    int_tokenized_source = []
    int_tokenized_input_target = []
    int_tokenized_output_target = []
    for token in tokenized_source_text:
        int_tokenized_source.append(tokenizer.piece_to_id(token))
    for i, token in enumerate(tokenized_target_text):
        if i == 0:
            int_tokenized_input_target.append(start_token_id)
            continue
        int_tokenized_input_target.append(tokenizer.piece_to_id(token))
        int_tokenized_output_target.append(tokenizer.piece_to_id(token))
        
    int_tokenized_output_target.append(end_token_id)
    
    if len(int_tokenized_output_target) != len(int_tokenized_input_target):
        print("Error")
        
    enc_input_tokens.append(int_tokenized_source)
    dec_input_tokens.append(int_tokenized_input_target)
    dec_target_tokens.append(int_tokenized_output_target)

len(enc_input_tokens), len(dec_input_tokens), len(dec_target_tokens)

(45093, 45093, 45093)

In [5]:
max_enc_seq = max([len(i) for i in enc_input_tokens])
max_dec_seq = max([len(i) for i in dec_input_tokens])

max_enc_seq, max_dec_seq

(128, 61)

In [6]:
from tqdm import tqdm

def pad_or_truncate_inputs(data, max_len):
    new_data = []
    pad_id = 0
        
    for sample in tqdm(data):
        if len(sample) >= max_len:
            tmp = sample[:max_len]
        else:
            tmp = sample
            num_of_pads_needed = max_len - len(sample)
            for _ in range(num_of_pads_needed):
                tmp.append(pad_id)
                
        new_data.append(tmp)
        
    return new_data

In [7]:
enc_input_tokens = pad_or_truncate_inputs(enc_input_tokens, max_enc_seq)
dec_input_tokens = pad_or_truncate_inputs(dec_input_tokens, max_dec_seq)
dec_target_tokens = pad_or_truncate_inputs(dec_target_tokens, max_dec_seq)

enc_input_tokens = np.array(enc_input_tokens)
dec_input_tokens = np.array(dec_input_tokens)
dec_target_tokens = np.array(dec_target_tokens)

enc_input_tokens.shape, dec_input_tokens.shape, dec_target_tokens.shape

100%|██████████| 45093/45093 [00:00<00:00, 46824.35it/s]
100%|██████████| 45093/45093 [00:00<00:00, 86567.24it/s]
100%|██████████| 45093/45093 [00:00<00:00, 87853.67it/s]


((45093, 128), (45093, 61), (45093, 61))

In [None]:
enc_input_tokens

array([[1168,  192,  106, ...,    0,    0,    0],
       [1168,  192,  106, ...,    0,    0,    0],
       [1354,  241,  106, ...,    0,    0,    0],
       ...,
       [   6, 1945,    6, ...,    0,    0,    0],
       [1027,    6,  945, ...,  368, 2764,  106],
       [3174,    6,  497, ...,    0,    0,    0]])

In [None]:
dec_target_tokens

array([[ 167,  420,    5, ...,    0,    0,    0],
       [2620,   40, 3654, ...,    0,    0,    0],
       [ 331,  176,   10, ...,    0,    0,    0],
       ...,
       [3617,  702, 3135, ...,    0,    0,    0],
       [4701, 4548, 2637, ...,    0,    0,    0],
       [ 419,  502, 1317, ...,    0,    0,    0]])

In [None]:
from keras.utils import to_categorical

vocab_size = 8000
one_hot_label = to_categorical(dec_target_tokens, num_classes=vocab_size)

In [18]:
vocab_size = 8000
hidden_dims = 256
emb_dims = 300

##Network Architecture
enc_inputs = Input(shape=(max_enc_seq,))
enc_emb = Embedding(vocab_size, emb_dims)(enc_inputs)
enc = CuDNNLSTM(hidden_dims, return_state=True)
_, state_h, state_c = enc(enc_emb)
enc_states = [state_h, state_c]

dec_inputs = Input(shape=(max_dec_seq,))
dec_emb = Embedding(vocab_size, emb_dims)(dec_inputs)
dec = CuDNNLSTM(hidden_dims, return_sequences=True, return_state=True)
dec_outputs, _, _ = dec(dec_emb, initial_state=enc_states)
dec_dense = Dense(1, activation="sigmoid")
dec_outputs = dec_dense(dec_outputs)

model = Model([enc_inputs, dec_inputs], dec_outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 128)          0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 61)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 128, 300)     2400000     input_10[0][0]                   
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 61, 300)      2400000     input_11[0][0]                   
__________________________________________________________________________________________________
cu_dnnlstm

In [19]:
plot_model(model, to_file="seq2seq.png", show_shapes=True)

In [20]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.fit([enc_input_tokens, dec_input_tokens],
          dec_target_tokens,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2
         )

ValueError: Error when checking target: expected dense_4 to have 3 dimensions, but got array with shape (45093, 61)

In [28]:
output = ""
for i in target_tokens[2]:
    output += tokenizer.IdToPiece(i)
    
output

'<こんにちは。>'

In [19]:
output = ""
for i in input_tokens[0]:
    output += tokenizer.IdToPiece(i)
    
output

'▁Go.'