In [1]:
from __future__ import absolute_import, division, print_function
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pprint import pprint

import tensorflow as tf
import numpy as np
import os

tf.enable_eager_execution()

In [2]:
sources = [['I', 'feel', 'hungry'],
          ['tensorflow', 'is', 'very', 'difficult'],
          ['tensorflow', 'is', 'a', 'framework', 'for', 'deep', 'learning'],
          ['tensorflow', 'is', 'very', 'fast', 'changing']]
targets = [['나는', '배가', '고프다'],
          ['텐서플로우는', '매우', '어렵다'],
          ['텐서플로우는', '딥러닝을', '위한', '프레임워크이다'],
          ['텐서플로우는', '매우', '빠르게', '변화한다']]

In [3]:
s_vocab = list(set(sum(sources, [])))
s_vocab.sort()
s_vocab = ['<pad>'] + s_vocab

In [4]:
source2idx = {word: idx for idx, word in enumerate(s_vocab)}
idx2source = {idx: word for idx, word in enumerate(s_vocab)}

In [5]:
pprint(source2idx)

{'<pad>': 0,
 'I': 1,
 'a': 2,
 'changing': 3,
 'deep': 4,
 'difficult': 5,
 'fast': 6,
 'feel': 7,
 'for': 8,
 'framework': 9,
 'hungry': 10,
 'is': 11,
 'learning': 12,
 'tensorflow': 13,
 'very': 14}


In [6]:
pprint(idx2source)

{0: '<pad>',
 1: 'I',
 2: 'a',
 3: 'changing',
 4: 'deep',
 5: 'difficult',
 6: 'fast',
 7: 'feel',
 8: 'for',
 9: 'framework',
 10: 'hungry',
 11: 'is',
 12: 'learning',
 13: 'tensorflow',
 14: 'very'}


In [7]:
t_vocab = list(set(sum(targets, [])))
t_vocab.sort()
t_vocab = ['<pad>', '<bos>', '<eos>'] + t_vocab

In [8]:
target2idx = {word: idx for idx, word in enumerate(t_vocab)}
idx2target = {idx: word for idx, word in enumerate(t_vocab)}

In [9]:
pprint(target2idx)

{'<bos>': 1,
 '<eos>': 2,
 '<pad>': 0,
 '고프다': 3,
 '나는': 4,
 '딥러닝을': 5,
 '매우': 6,
 '배가': 7,
 '변화한다': 8,
 '빠르게': 9,
 '어렵다': 10,
 '위한': 11,
 '텐서플로우는': 12,
 '프레임워크이다': 13}


In [10]:
pprint(idx2target)

{0: '<pad>',
 1: '<bos>',
 2: '<eos>',
 3: '고프다',
 4: '나는',
 5: '딥러닝을',
 6: '매우',
 7: '배가',
 8: '변화한다',
 9: '빠르게',
 10: '어렵다',
 11: '위한',
 12: '텐서플로우는',
 13: '프레임워크이다'}


In [11]:
def preprocess(sequences, max_len, dic, mode='source') :
    assert mode in ['source', 'target'], 'source와 target 중에 선택해주세요.'
    
    if mode == 'source' :
        #ENCODER
        s_input = list(map(lambda sentence: [dic.get(token) for token in sentence], sequences))
        s_len = list(map(lambda sentence: len(sentence), s_input))
        s_input = pad_sequences(sequences=s_input, maxlen=max_len, padding='post', truncating='post')
        
        return s_len, s_input
    
    elif mode == 'target' :
        #DECODER
        t_input = list(map(lambda sentence: ['<bos>']+sentence+['<eos>'], sequences))
        t_input = list(map(lambda sentence: [dic.get(token) for token in sentence], t_input))
        t_len = list(map(lambda sentence: len(sentence), t_input))
        t_input = pad_sequences(sequences=t_input, maxlen=max_len, padding='post', truncating='post')
        
        t_output = list(map(lambda sentence: sentence+['<eos>'], sequences))
        t_output = list(map(lambda sentence: [dic.get(token) for token in sentence], t_output))
        t_output = pad_sequences(sequences=t_output, maxlen=max_len, padding='post', truncating='post')
        
        return t_len, t_input, t_output

In [12]:
s_max_len = 10
s_len, s_input = preprocess(sequences=sources, max_len=s_max_len, dic=source2idx, mode='source')

In [13]:
pprint(s_len)
pprint(s_input)

[3, 4, 7, 5]
array([[ 1,  7, 10,  0,  0,  0,  0,  0,  0,  0],
       [13, 11, 14,  5,  0,  0,  0,  0,  0,  0],
       [13, 11,  2,  9,  8,  4, 12,  0,  0,  0],
       [13, 11, 14,  6,  3,  0,  0,  0,  0,  0]])


In [14]:
t_max_len = 12
t_len, t_input, t_output = preprocess(sequences=targets, max_len=t_max_len, dic=target2idx, mode='target')

In [15]:
pprint(t_len)
pprint(t_input)
pprint(t_output)

[5, 5, 6, 6]
array([[ 1,  4,  7,  3,  2,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 12,  6, 10,  2,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 12,  5, 11, 13,  2,  0,  0,  0,  0,  0,  0],
       [ 1, 12,  6,  9,  8,  2,  0,  0,  0,  0,  0,  0]])
array([[ 4,  7,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  6, 10,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  5, 11, 13,  2,  0,  0,  0,  0,  0,  0,  0],
       [12,  6,  9,  8,  2,  0,  0,  0,  0,  0,  0,  0]])


# hyper parameter

In [16]:
epochs = 100
batch_size = 4
learning_rate = 5e-3
total_step = epochs/batch_size
buffer_size = 100
n_batch = buffer_size//batch_size
embedding_dim = 32
units = 32

# input

In [17]:
data = tf.data.Dataset.from_tensor_slices((s_len, s_input, t_len, t_input, t_output))
data = data.shuffle(buffer_size=buffer_size)
data = data.batch(batch_size=batch_size)

In [18]:
def gru(units) :
    return tf.keras.layers.CuDNNGRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

In [19]:
class Encoder(tf.keras.Model) :
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size) :
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden) :
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        
        return output, state
    
    def initialize_hidden_state(self) :
        return tf.zeros((self.batch_size, self.enc_units))

In [20]:
class Decoder(tf.keras.Model) :
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size) :
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x, hidden, enc_output) :
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        
        return x, state
    
    def initialize_hidden_state(self) :
        return tf.zeros((self.batch_size, self.dec_units))

In [21]:
encoder = Encoder(len(source2idx), embedding_dim, units, batch_size)
decoder = Decoder(len(target2idx), embedding_dim, units, batch_size)
pprint(encoder)
pprint(decoder)

<__main__.Encoder object at 0x000001473123F828>
<__main__.Decoder object at 0x000001473123F898>


In [22]:
def loss_function(real, pred) :
    mask = 1-np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    
    return tf.reduce_mean(loss_)

In [23]:
optimizer = tf.train.AdamOptimizer()

In [24]:
checkpoint_dir = './data_out/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

summary_writer = tf.contrib.summary.create_file_writer(logdir=checkpoint_dir)

W0521 23:03:17.630368 33380 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [25]:
result = float('inf')
epoch = 0

while result > 0.05 :
    epoch += 1
    hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for i, (s_len, s_input, t_len, t_input, t_output) in enumerate(data) :
        loss = 0

        with tf.GradientTape() as tape :
            enc_output, enc_hidden = encoder(s_input, hidden)
            dec_hidden = enc_hidden
            dec_input = tf.expand_dims([target2idx['<bos>']]*batch_size, 1)

            for t in range(1, t_input.shape[1]) :
                predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
                loss += loss_function(t_input[:, t], predictions)
                dec_input = tf.expand_dims(t_input[:, t], 1)

        batch_loss = (loss/int(t_input.shape[1]))
        result = batch_loss.numpy()
        total_loss += batch_loss
        variables = encoder.variables+decoder.variables
        gradient = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradient, variables))

    if epoch%10 == 0 :
        print('Epoch: {},  Loss: {:.4f},  Batch Loss: {:.4f}'.format(epoch, total_loss/n_batch, batch_loss.numpy()))
        checkpoint.save(file_prefix=checkpoint_prefix)

Epoch: 10,  Loss: 0.0387,  Batch Loss: 0.9671
Epoch: 20,  Loss: 0.0373,  Batch Loss: 0.9316
Epoch: 30,  Loss: 0.0346,  Batch Loss: 0.8657
Epoch: 40,  Loss: 0.0304,  Batch Loss: 0.7599
Epoch: 50,  Loss: 0.0269,  Batch Loss: 0.6713
Epoch: 60,  Loss: 0.0236,  Batch Loss: 0.5907
Epoch: 70,  Loss: 0.0207,  Batch Loss: 0.5174
Epoch: 80,  Loss: 0.0179,  Batch Loss: 0.4485
Epoch: 90,  Loss: 0.0154,  Batch Loss: 0.3847
Epoch: 100,  Loss: 0.0131,  Batch Loss: 0.3271
Epoch: 110,  Loss: 0.0111,  Batch Loss: 0.2770
Epoch: 120,  Loss: 0.0094,  Batch Loss: 0.2353
Epoch: 130,  Loss: 0.0080,  Batch Loss: 0.2012
Epoch: 140,  Loss: 0.0070,  Batch Loss: 0.1743
Epoch: 150,  Loss: 0.0061,  Batch Loss: 0.1530
Epoch: 160,  Loss: 0.0054,  Batch Loss: 0.1358
Epoch: 170,  Loss: 0.0049,  Batch Loss: 0.1218
Epoch: 180,  Loss: 0.0044,  Batch Loss: 0.1104
Epoch: 190,  Loss: 0.0041,  Batch Loss: 0.1017
Epoch: 200,  Loss: 0.0038,  Batch Loss: 0.0953
Epoch: 210,  Loss: 0.0036,  Batch Loss: 0.0907
Epoch: 220,  Loss: 0.0

In [26]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x14758cb1860>

In [27]:
def prediction(sentence, encoder, decoder, inp_lang, tar_lang, max_length_inp, max_length_tar) :
    result = ''
    
    inputs = [inp_lang[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
        
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([tar_lang['<bos>']], 0)
    
    for t in range(max_length_tar) :
        predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        
        result += idx2target[predicted_id]+' '
        
        if idx2target[predicted_id] == '<eos>' :
            return result, sentence
        
        dec_input = tf.expand_dims([predicted_id], 0)
        
    return result, sentence

In [31]:
sentence = 'tensorflow is very difficult'

In [32]:
result, output_sentence = prediction(sentence, encoder, decoder, source2idx, target2idx, s_max_len, t_max_len)

In [33]:
pprint(result)

'텐서플로우는 매우 어렵다 <eos> '
