In [1]:
import os 
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam, SGD

In [2]:
# some config
BATCH_SIZE = 64  # Batch size for training.
EPOCHS = 40  # Number of epochs to train for.
LATENT_DIM = 256  # Latent dimensionality of the encoding space.
NUM_SAMPLES = 10000  # Number of samples to train on.
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100

In [3]:
# to store the data
input_texts = []            # sentence in original language 
target_texts = []           # sentence in target language 
target_texts_inputs = []    # sentence in target language offset by 1

In [4]:
# load in the data
# download the data at: http://www.manythings.org/anki/
t = 0 
for line in open('/content/spa.txt'):
  #  only keep a limited number of samples
  t += 1
  if t > NUM_SAMPLES:
    break

  # input and target are separated by \t
  if '\t' not in line:
    continue

  # split up the input translation
  input_text, translation, garbage = line.split('\t')

  # make the target input and output
  # recall we'll be using teacher forcing
  target_text = translation + '<eos>'
  target_text_input = '<sos>' + translation

  input_texts.append(input_text)
  target_texts.append(target_text)
  target_texts_inputs.append(target_text_input)
  

In [5]:
print('num samples:', len(input_texts))
print('first 10 input texts:', input_texts[:10])
print('first 10 target texts:', target_texts[:10])
print('first 10 target text inputs:', target_texts_inputs[:10])

num samples: 10000
first 10 input texts: ['Go.', 'Go.', 'Go.', 'Go.', 'Hi.', 'Run!', 'Run!', 'Run!', 'Run!', 'Run.']
first 10 target texts: ['Ve.<eos>', 'Vete.<eos>', 'Vaya.<eos>', 'Váyase.<eos>', 'Hola.<eos>', '¡Corre!<eos>', '¡Corran!<eos>', '¡Corra!<eos>', '¡Corred!<eos>', 'Corred.<eos>']
first 10 target text inputs: ['<sos>Ve.', '<sos>Vete.', '<sos>Vaya.', '<sos>Váyase.', '<sos>Hola.', '<sos>¡Corre!', '<sos>¡Corran!', '<sos>¡Corra!', '<sos>¡Corred!', '<sos>Corred.']


In [6]:
# tokenize the inputs
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

# tokenize the outputs
# don't filter out spicial characters
# otherwise <sos>, <eos> won't appear
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs) # ineficient
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_inputs)

In [7]:
# get the word to index mapping for input language
word2idx_inputs = tokenizer_inputs.word_index
print('found %s unique input tokens.' % len(word2idx_inputs))

# get the word to index mapping for output language
word2idx_outputs = tokenizer_outputs.word_index
print('found %s unique output tokens.' % len(word2idx_outputs))

found 2288 unique input tokens.
found 12093 unique output tokens.


In [8]:
# determine maximum length input sequence
max_len_input = max(len(s) for s in input_sequences)

# determine maximum length output sequence
max_len_target = max(len(s) for s in target_sequences)

# store number of output words for latter
# remember to add 1 since indexing starts from 1
num_words_output =  len(word2idx_outputs) + 1

In [9]:
# pad sequences so that we get a N * T matrix
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print('shape of encoder data tensor:', encoder_inputs.shape)
print('encoder_data[0]:', encoder_inputs[0])

decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_target, padding='post')
print('\nshape of decoder data tensor:', decoder_inputs.shape)
print('decoder_inputs[0]:', decoder_inputs[0])

decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')
print('\nshape of decoder data tensor:', decoder_targets.shape)
print('decoder_inputs[0]:', decoder_targets[0])

shape of encoder data tensor: (10000, 5)
encoder_data[0]: [ 0  0  0  0 15]

shape of decoder data tensor: (10000, 8)
decoder_inputs[0]: [8520    0    0    0    0    0    0    0]

shape of decoder data tensor: (10000, 8)
decoder_inputs[0]: [2756    0    0    0    0    0    0    0]


In [11]:
# store all pre-traind word vectors
print('Loading word vector...')
word2vec = {}
with open('/content/glove.6B.%sd.txt' % EMBEDDING_DIM) as f:
  # Is just a space-separted text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors: ' % len(word2vec))

Loading word vector...
Found 114621 word vectors: 


In [12]:
# prepare embedding matrix
print('filling pre-trained embedding...')
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1 ) 
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))   # 3000 * 50 - V * D
for word, i in word2idx_inputs.items():
  if i < MAX_NUM_WORDS:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be zeros.
      embedding_matrix[i] = embedding_vector

filling pre-trained embedding...


In [13]:
# load per-trained word embeddings into an Embedding layer 
# note that we set trainable = False so as it keep embedding fixed.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights = [embedding_matrix],
    input_length = max_len_input,
    #trainable = False
)

In [14]:
# create targets -> can't use sparse cross entropy
# categorical cross entropy when we have sequences
decoder_targets_one_hot = np.zeros((len(input_texts), max_len_target, num_words_output)) 
for i, target_seq in enumerate(decoder_targets):
  for t, word in enumerate(target_seq):
    if word > 0:
      decoder_targets_one_hot[i, t, word] = 1

In [15]:
print('Building model...')

encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(
  LATENT_DIM,
  return_state=True,
  # dropout=0.5 # dropout not available on gpu
)
encoder_outputs, h, c = encoder(x)
# encoder_outputs, h = encoder(x) #gru

# keep only the states to pass into decoder
encoder_states = [h, c]
# encoder_states = [h]  # gru

# set up the decoder, using [h, c] as inital state.
decoder_inputs_placeholder = Input(shape=(max_len_target,))

# this word embedding will not use pre-trained vectors
# although you could
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding( decoder_inputs_placeholder )

# since the decoder is a "to_many" model we want to have
# return_sequences = True
decoder_lstm = LSTM(
  LATENT_DIM,
  return_sequences=True,
  return_state=True,
  # dropout=0.5 # dropout not available on gpu
)
decoder_outputs, _, _ = decoder_lstm(
  decoder_inputs_x,
  initial_state=encoder_states
)
# decoder_outputs, _ = decoder_gru(decoder_inputs_x, initial_state=encoder_states)

# final dense layer for prediction
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)

Building model...


In [None]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

print('Training model...')
h = model.fit(
    [encoder_inputs, decoder_inputs],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs= EPOCHS,
    validation_split=0.2
)

Training model...


In [None]:
plt.plot(h.history['loss'] ,label='loss')
plt.plot(h.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

In [None]:
plt.plot(h.history['accuracy'] ,label='acc')
plt.plot(h.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()

In [None]:
##### Making Predictions #####
# As with poetry example, we need to create another model
# that can take in the RNN state and previous word as input
# and accept a T=1 sequence.

# The encoder will be stand-alone
# from this we will get our initial decoder hidden state
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
#decoder_states_inputs = [decoder_state_input_h] # gru

decoder_inputs_signal = Input(shape=(1,))
decoder_inputs_signal_x = decoder_embbeding(decoder_inputs_signal)

# this time, we want to keep the states to be the output
# by out sampling model
decoder_outputs, h, c = decoder_lstm( decoder_inputs_signal_x, initial_state=decoder_states_inputs)
# decoder_outputs, h = decoder_lstm( decoder_inputs_signal_x, initial_state=decoder_states_inputs) # gru

decoder_States = [h, c]
#decoder_States = [h] # gru

# the sampling model
# inputs: y(t-1), h(t-1), c(t-1)
# outputs: y(t), h(t), c(t)
decoder_model = Model(
    [decoder_inputs_signal] + decoder_states_inputs,
    [decoder_outputs] + decoder_States
)

In [None]:
# map indexes back into real words
# so we can view the results
idx2word_eng = {v:k for k, v in word2idx_inputs.items()}
idx2word_trans = {v:k for k, v in word2idx_outputs.items()}

In [None]:
def decoder_sequences(input_seq):
  # encode the input as state vectors.
  states_value = encoder_model.predict(input_seq)

  # generate empty target sequence of length 1.
  target_seq = np.zeros((1, 1))

  # populate the first character of target sequence with the start character.
  # NOTE: tokenizer lower-cases all words
  target_seq[0, 0] = word2idx_outputs['<sos>']

  # if we get this we break
  eos = word2idx_outputs['<eos>']

  # create the translation
  output_sentence = []
  for _ in range(max_len_target):
    output_tokens, h, c = decoder_model.predict(
        [target_seq] + states_value
    )
    # output_tokens, h = decoder_model.predict( [target_seq] + states_value) # gru

    # get next word
    idx = np.argmax(output_tokens[0, 0, :])

    # End sentence of EOS
    if eos == idx:
      break

    word = ''
    if idx > 0:
      word = idx2word_trans[idx]
      output_sentence.append(word)

    # update the decoder input
    # which is just the word just generated 
    target_seq[0, 0] = idx

    # update states
    states_value = [h, c]
    # states_value = [h] # gru

    return ' '.join(output_sentence)

In [None]:
while True:
  # do some test translation
  i = np.random.chice(  len(input_texts)  )
  input_seq = encoder_inputs[i:i+1]
  translation = decode_sequence(input_seq)
  print('-')
  print('Input:', input_texts[i])
  print('translation:', translation)

  ans = input('Continue? [Y/n]')
  if ans and ans.lower().startswith('n'):
    break