In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, RepeatVector, Concatenate, Activation, Dot, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K

In [0]:
def softmax_over_time(x):
    assert(K.ndim(x) > 2)
    e = K.exp(x - K.max(x, axis=1, keepdims=True))
    s = K.sum(e, axis=1, keepdims=True)
    return e / s

In [0]:
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_SAMPLES = 10000
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 100
LATENT_DIM = 256

In [5]:
# Load the data
input_texts = []           # Sentence in original language
target_texts = []          # Sentence in target language
target_texts_inputs = []        # Sentence in target language with offset

count = 0

with open("drive//My Drive//data_science//chatbot_data//machine_translation//spa.txt", encoding="utf8") as f:
    for line in f:
        if line:
            count += 1
            if count > MAX_NUM_SAMPLES:
                break
                
            input_sentence, translation, _ = line.split("\t")
            
            target_sentence = translation + " <eos>"
            target_sentence_input = "<sos> " + translation
            
            input_texts.append(input_sentence)
            target_texts.append(target_sentence)
            target_texts_inputs.append(target_sentence_input)
            
print("Number of samples in data: ", len(input_texts))

Number of samples in data:  10000


In [6]:
# Tokenize the inputs
tokenizer_inputs = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

# Word to index, and index to word mapping
word2idx_inputs = tokenizer_inputs.word_index
idx2word_inputs = {v: k for k, v in word2idx_inputs.items()}

max_input_len = max([len(s) for s in input_sequences])

# Tokenize the outputs(keep sos and eos tokens, so dont filter)
tokenizer_outputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters="")
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs)
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_inputs)

# Word to index, and index to word mapping
word2idx_targets = tokenizer_outputs.word_index
idx2word_targets = {v: k for k, v in word2idx_targets.items()}

num_words_target = len(word2idx_targets) + 1

max_target_len = max([len(s) for s in target_sequences])

# Pad the sequences(we don't add zero padding at the end for encoder, as it helps decoder.)
encoder_inputs = pad_sequences(input_sequences, maxlen=max_input_len)
print("Encoder input data shape: ", encoder_inputs.shape)

decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_target_len, padding="post")
print("Decoder input data shape: ", decoder_inputs.shape)

decoder_targets = pad_sequences(target_sequences, maxlen=max_target_len, padding="post")
print("Decoder target data shape: ", decoder_targets.shape)

Encoder input data shape:  (10000, 5)
Decoder input data shape:  (10000, 9)
Decoder target data shape:  (10000, 9)


In [7]:
# Load word vectors
print("Loading word vectors...")
word2vec = {}
with open("drive//My Drive//data_science//word_embeddings//glove.6B.{}d.txt".format(EMBEDDING_DIM), encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype="float32")
        word2vec[word] = vec
print("Loaded {} word vectors.".format(len(word2vec)))

# Prepare embeddings
print("Filling pre-trained embeddings...")
num_words_input = min(MAX_VOCAB_SIZE, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words_input, EMBEDDING_DIM))                # Shape -> (vocab_size, embedding_dim)
for word, i in word2idx_inputs.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)       # Returns None if not found
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
print("Filled {} pre-trained embeddings.".format(embedding_matrix.shape[0]))

Loading word vectors...
Loaded 400000 word vectors.
Filling pre-trained embeddings...
Filled 2351 pre-trained embeddings.


In [0]:
# One hot encode targets as we can't use sparse cross entropy when each sample has multiple targets
decoder_one_hot_targets = np.zeros((MAX_NUM_SAMPLES, max_target_len, num_words_target))
for i, decoder_target in enumerate(decoder_targets):
    for t, word in enumerate(decoder_target):
        if int(word) > 0:
            decoder_one_hot_targets[i, t, int(word)] = 1

In [0]:
# Creating the encoder model

# Inputs
encoder_inputs_model = Input(shape=(max_input_len, ))

# Model
encoder_embedding_layer = Embedding(num_words_input, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_input_len)
encoder_x = encoder_embedding_layer(encoder_inputs_model)
encoder_lstm = Bidirectional(LSTM(LATENT_DIM, return_sequences=True, dropout=0.5))
encoder_outputs = encoder_lstm(encoder_x)

In [0]:
# Creating the decoder model

# Inputs
decoder_inputs_model = Input(shape=(max_target_len, ))

# Model(Part - I)
decoder_embedding_layer = Embedding(num_words_target, EMBEDDING_DIM)
decoder_embeddings = decoder_embedding_layer(decoder_inputs_model)

# Attention
attn_repeat_layer = RepeatVector(max_input_len)
attn_concat_layer = Concatenate(axis=-1)
attn_dense1 = Dense(10, activation="tanh")
attn_dense2 = Dense(1, activation=softmax_over_time)
attn_dot = Dot(axes=1)

def one_step_attention(h, st_1):
    # Copy state s(t-1) max_input_len times
    st_1 = attn_repeat_layer(st_1)
    # Concatenate all h(t)'s with s(t-1)
    x = attn_concat_layer([h, st_1])
    # Neural network layer 1
    x = attn_dense1(x)
    # Get alpha values
    alphas = attn_dense2(x)
    # Get context
    context = attn_dot([alphas, h])
    
    return context

# Model(Part - II)
decoder_lstm = LSTM(LATENT_DIM, return_state=True)
decoder_dense = Dense(num_words_target, activation="softmax")

initial_s = Input(shape=(LATENT_DIM, ), name="s0")
initial_c = Input(shape=(LATENT_DIM, ), name="c0")
context_last_word_concat_layer = Concatenate(axis=2)

s = initial_s
c = initial_c

outputs = []
for t in range(max_target_len):
    # Get the context using attention
    context = one_step_attention(encoder_outputs, s)
    # We need to select one word of input only, not the entire sequence of input
    selector = Lambda(lambda x: x[:, t:t+1])
    x_t = selector(decoder_embeddings)
    # Concatenate
    decoder_lstm_input = context_last_word_concat_layer([context, x_t])
    o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s, c])
    decoder_outputs = decoder_dense(o)
    
    outputs.append(decoder_outputs)
    
def stack_and_transpose(x):
    """Shape of x: [max_target_len, batch_size, num_words_target]. We need the batch_size first."""
    x = K.stack(x)
    x = K.permute_dimensions(x, pattern=(1, 0, 2))
    return x

stacker = Lambda(stack_and_transpose)
outputs = stacker(outputs)

model = Model(inputs=[encoder_inputs_model, decoder_inputs_model, initial_s, initial_c], outputs=outputs)

In [0]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [12]:
z = np.zeros((len(encoder_inputs), LATENT_DIM))
result = model.fit([encoder_inputs, decoder_inputs, z, z], decoder_one_hot_targets, batch_size=BATCH_SIZE, 
                   epochs=EPOCHS, validation_split=VALIDATION_SPLIT)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [0]:
encoder_model = Model(encoder_inputs_model, encoder_outputs)
# Encoder hidden states
encoder_outputs_as_input = Input(shape=(max_input_len, LATENT_DIM * 2, ))

decoder_inputs_single = Input(shape=(1, ))
decoder_embeddings_input = decoder_embedding_layer(decoder_inputs_single)

# No need to loop through as this is done for 1 time step
context = one_step_attention(encoder_outputs_as_input, initial_s)

# Combine context with last word
decoder_lstm_input = context_last_word_concat_layer([context, decoder_embeddings_input])

o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[initial_s, initial_c])
decoder_outputs = decoder_dense(o)

decoder_model = Model(inputs=[decoder_inputs_single, encoder_outputs_as_input, initial_s, initial_c], 
                      outputs=[decoder_outputs, s, c])

In [0]:
def decode_sequence(input_seq):
    enc_hidden_states = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_targets["<sos>"]
    eos = word2idx_targets["<eos>"]

    s = np.zeros((1, LATENT_DIM))
    c = np.zeros((1, LATENT_DIM))

    output_sentence = []
    for _ in range(max_target_len):
        o, s, c = decoder_model.predict([target_seq[0], enc_hidden_states, s, c])
        idx = np.argmax(o.flatten())
        if idx == eos:
            break
        word = ""
        if idx > 0:
            word = idx2word_targets[idx]
            output_sentence.append(word)
        target_seq[0, 0] = idx
        
    return " ".join(output_sentence)

In [52]:
input_sentence_original = ["I am fine.", "I am happy.", "Sure."]
input_sentence = [re.findall(r"[A-Za-z]+", sent) for sent in input_sentence_original]

for i, sent in enumerate(input_sentence):
    encoded_sentence = []
    count = 0
    for word in sent:
        if count > max_input_len:
            break
        if word in word2idx_inputs:
            idx = word2idx_inputs[word]
            encoded_sentence.append(idx)
            count += 1

    if count < max_input_len:
        difference = max_input_len - count 
        for _ in range(difference):
            encoded_sentence.insert(0, 0)
    print("Original sentence: ", input_sentence_original[i])
    print("Predicted translation: ", decode_sequence(np.array(encoded_sentence).reshape((1, max_input_len))))

Original sentence:  I am fine.
Predicted translation:  estoy bien.
Original sentence:  I am happy.
Predicted translation:  estoy feliz.
Original sentence:  Sure.
Predicted translation:  ¡órale!
