In [None]:
import string
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense,Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [None]:
import pathlib

# Download (and cache) the Spanish-English sentence-pair archive.
# The archive is fetched over HTTPS rather than plain HTTP so the download
# cannot be altered in transit.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# The text file is looked up next to the downloaded archive, under 'spa-eng/'.
path_to_data = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip



In [None]:
# Read the whole translation file as UTF-8 text. The context manager closes
# the handle even if read() raises, which the manual open/close did not.
with open(path_to_data, "r", encoding='utf-8') as translation_file:
  raw_data = translation_file.read()

In [None]:
# Split the raw text into lines under a new name. Rebinding `raw_data` to the
# list made this cell fail on a second run (a list has no .split()).
raw_lines = raw_data.split('\n')
# Each line is split on tabs into its columns.
pairs = [sentence.split('\t') for sentence in raw_lines]
# Keep only lines 1000..19999 of the file as the working subset.
pairs = pairs[1000:20000]

In [None]:
def clean_sentence(sentence):
  """Normalize a sentence before tokenization.

  The text is lower-cased and every character found in
  ``string.punctuation``, plus the Spanish inverted marks "¡" and "¿",
  is deleted.

  Args:
    sentence: The raw sentence string.

  Returns:
    The lower-cased sentence with the punctuation characters removed.
  """
  chars_to_drop = string.punctuation + "¡" + '¿'
  # The third argument of maketrans lists the characters to delete.
  deletion_table = str.maketrans('', '', chars_to_drop)
  return sentence.lower().translate(deletion_table)

def tokenize(sentences):
  """Fit a Keras ``Tokenizer`` on ``sentences`` and encode those same texts.

  Args:
    sentences: The texts used both to fit the tokenizer and to be encoded.

  Returns:
    A ``(sequences, tokenizer)`` tuple: the output of
    ``texts_to_sequences`` for ``sentences``, and the fitted tokenizer.
  """
  fitted_tokenizer = Tokenizer()
  fitted_tokenizer.fit_on_texts(sentences)
  sequences = fitted_tokenizer.texts_to_sequences(sentences)
  return sequences, fitted_tokenizer

# Column 0 of each pair is used as the English text and column 1 as the
# Spanish text; both are normalized with clean_sentence.
english_sentences = list(map(clean_sentence, (pair[0] for pair in pairs)))
spanish_sentences = list(map(clean_sentence, (pair[1] for pair in pairs)))

In [None]:
# Fit one tokenizer per language and encode the cleaned sentences.
spa_text_tokenized, spa_text_tokenizer = tokenize(spanish_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

# Report the number of tokens in the longest encoded sentence of each language.
print('Maximum length spanish sentence: {}'.format(max(map(len, spa_text_tokenized))))
print('Maximum length english sentence: {}'.format(max(map(len, eng_text_tokenized))))

Maximum length spanish sentence: 12

Maximum length english sentence: 6


In [None]:
# Vocabulary sizes: one entry per word in each tokenizer's word_index, plus
# one extra slot (presumably for index 0, which the padding uses — confirm).
# Note that the printed figures include that extra slot.
spanish_vocab = 1 + len(spa_text_tokenizer.word_index)
english_vocab = 1 + len(eng_text_tokenizer.word_index)
print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Spanish vocabulary is of 7239 unique words

English vocabulary is of 3804 unique words


In [None]:
# Length (in tokens) of the longest encoded sentence in each language.
max_spanish_len = max(len(seq) for seq in spa_text_tokenized)
max_english_len = max(len(seq) for seq in eng_text_tokenized)

# Pad every sequence at the end ("post") up to its language's maximum length.
spa_pad_sentence = pad_sequences(spa_text_tokenized, maxlen=max_spanish_len, padding="post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, maxlen=max_english_len, padding="post")

In [None]:
# Reshape data: give each padded array a trailing axis of size 1, i.e.
# (samples, max_len) -> (samples, max_len, 1).
# The target shape is spelled out explicitly so that re-running this cell is a
# no-op; reshaping to (*shape, 1) appended yet another axis on every re-run.
spa_pad_sentence = spa_pad_sentence.reshape(-1, max_spanish_len, 1)
eng_pad_sentence = eng_pad_sentence.reshape(-1, max_english_len, 1)

In [None]:
# input_sequence = Input(shape=(max_spanish_len,))
# embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)

In [None]:
# input_sequence = Input(shape=(max_spanish_len,))
# embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
# encoder = LSTM(64, return_sequences=False)(embedding)

In [None]:
# input_sequence = Input(shape=(max_spanish_len,))
# embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
# encoder = LSTM(64, return_sequences=False)(embedding)
# r_vec = RepeatVector(max_english_len)(encoder)

In [None]:
# input_sequence = Input(shape=(max_spanish_len,))
# embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
# encoder = LSTM(64, return_sequences=False)(embedding)
# r_vec = RepeatVector(max_english_len)(encoder)
# decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)

In [None]:
# Encoder: embed the Spanish token ids and summarize the sequence with an LSTM
# that returns only its final output.
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128)(input_sequence)
encoder = LSTM(units=64, return_sequences=False)(embedding)

# Decoder: repeat the encoder output once per English time step, run a second
# LSTM over it, and project every step onto the English vocabulary.
r_vec = RepeatVector(n=max_english_len)(encoder)
decoder = LSTM(units=64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(Dense(units=english_vocab))(decoder)

# Softmax turns the per-step scores into a distribution over English words.
enc_dec_model = Model(inputs=input_sequence, outputs=Activation('softmax')(logits))
enc_dec_model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model"

_________________________________________________________________

 Layer (type)                Output Shape              Param #   


 input_1 (InputLayer)        [(None, 12)]              0         

                                                                 

 embedding (Embedding)       (None, 12, 128)           926592    

                                                                 

 lstm (LSTM)                 (None, 64)                49408     

                                                                 

 repeat_vector (RepeatVecto  (None, 6, 64)             0         

 r)                                                              

                                                                 

 lstm_1 (LSTM)               (None, 6, 64)             33024     

                                                                 

 time_distributed (TimeDist  (None, 6, 3804)           247260    

 ributed)                                    

In [None]:
# Train on the padded Spanish inputs against the padded English targets.
enc_dec_model.fit(
    x=spa_pad_sentence,
    y=eng_pad_sentence,
    epochs=50,
)

Epoch 1/50


Epoch 2/50


Epoch 3/50


Epoch 4/50


Epoch 5/50


Epoch 6/50


Epoch 7/50


Epoch 8/50


Epoch 9/50


Epoch 10/50


Epoch 11/50


Epoch 12/50


Epoch 13/50


Epoch 14/50


Epoch 15/50


Epoch 16/50


Epoch 17/50


Epoch 18/50


Epoch 19/50


Epoch 20/50


Epoch 21/50


Epoch 22/50


Epoch 23/50


Epoch 24/50


Epoch 25/50


Epoch 26/50


Epoch 27/50


Epoch 28/50


Epoch 29/50


Epoch 30/50


Epoch 31/50


Epoch 32/50


Epoch 33/50


Epoch 34/50


Epoch 35/50


Epoch 36/50


Epoch 37/50


Epoch 38/50


Epoch 39/50


Epoch 40/50


Epoch 41/50


Epoch 42/50


Epoch 43/50


Epoch 44/50


Epoch 45/50


Epoch 46/50


Epoch 47/50


Epoch 48/50


Epoch 49/50


Epoch 50/50



<keras.src.callbacks.History at 0x7ca0e81f4ac0>

In [None]:
def logits_to_sentence(logits, tokenizer):
  """Decode a matrix of per-step scores into a space-separated sentence.

  Args:
    logits: 2-D array-like; the arg-max along axis 1 of each row is taken as
      the predicted word index for that step.
    tokenizer: Object exposing a ``word_index`` mapping of word -> index.

  Returns:
    The predicted words joined by single spaces. Index 0 is rendered as
    '<empty>'.
  """
  # Invert the word -> index mapping so predictions can be looked up by index.
  vocabulary = {}
  for word, idx in tokenizer.word_index.items():
    vocabulary[idx] = word
  # Assigned last so index 0 always maps to the placeholder.
  vocabulary[0] = '<empty>'

  predicted_ids = np.argmax(logits, 1)
  words = [vocabulary[word_id] for word_id in predicted_ids]
  return ' '.join(words)

# Show the reference pair and the model's translation for a few samples.
indexes = [1, 17]
for index in indexes:
  print("The english sentence is: {}".format(english_sentences[index]))
  print("The spanish sentence is: {}".format(spanish_sentences[index]))
  print('The predicted sentence is :')
  # Slice (rather than index) so the model receives a batch of one example;
  # predict() is called only after the header line above has been printed.
  single_example = spa_pad_sentence[index:index+1]
  predicted_scores = enc_dec_model.predict(single_example)[0]
  print(logits_to_sentence(predicted_scores, eng_text_tokenizer))
  print()

The english sentence is: he is here

The spanish sentence is: él está aquí

The predicted sentence is :


he is here <empty> <empty> <empty>



The english sentence is: hes smart

The spanish sentence is: él es inteligente

The predicted sentence is :


hes intelligent <empty> <empty> <empty> <empty>


