In [4]:
import pandas as pd
import numpy as np
from time import time

import tensorflow.keras as keras
from keras import Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import sparse_categorical_crossentropy

# Dataset

In [2]:
# Download the English/Spanish sentence-pair CSV and preview its first rows.
DATA_URL = "https://raw.githubusercontent.com/eduardofc/data/main/es_en.csv"
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,en,es
0,I hope you're not alone.,Espero que no estés solo.
1,"When I was taking a bath, the telephone rang.","Mientras me bañaba, sonó el teléfono."
2,I just need you to come with me.,Solo necesito que vengas conmigo.
3,Tom wondered how soon Mary would have dinner r...,Tom se preguntaba cuán pronto María tendría li...
4,Tom is waiting for an answer.,Tom está esperando una respuesta.


In [6]:
# Show the DataFrame's dimensions as a (rows, columns) tuple.
df.shape

(10000, 2)

In [7]:
# Spanish side: fit a tokenizer on the `es` column and convert every
# sentence into its list of integer token ids.

es_sentences = df["es"].values
es_tokenizer = Tokenizer()
es_tokenizer.fit_on_texts(es_sentences)
es_sequences = es_tokenizer.texts_to_sequences(es_sentences)

In [10]:
# English side: fit a separate tokenizer on the `en` column and convert
# every sentence into its list of integer token ids.

en_sentences = df["en"].values
en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(en_sentences)
en_sequences = en_tokenizer.texts_to_sequences(en_sentences)

In [14]:
# Needed for padding: the length (in tokens) of the longest encoded
# sentence on each side.

es_max_length = max(map(len, es_sequences))
en_max_length = max(map(len, en_sequences))

# Spanish length first, then English, one value per line.
print(es_max_length, en_max_length, sep="\n")

31
25


In [17]:
# Vocabulary sizes: number of entries in each tokenizer's word index, plus one.
# NOTE(review): the +1 presumably reserves id 0, which the prediction cell
# treats as "no word" — confirm against the Tokenizer documentation.

es_vocab = 1 + len(es_tokenizer.word_index)
en_vocab = 1 + len(en_tokenizer.word_index)

# Spanish size first, then English, one value per line.
print(es_vocab, en_vocab, sep="\n")

7893
5053


In [18]:
# Pad each side to its own maximum sentence length.
# No `padding` argument is passed, so pad_sequences' default padding side
# applies; `truncating` only matters for sequences longer than `maxlen`.

es_sequences_padded = pad_sequences(
    es_sequences,
    maxlen=es_max_length,
    truncating="post",
)
en_sequences_padded = pad_sequences(
    en_sequences,
    maxlen=en_max_length,
    truncating="post",
)

# Model

In [21]:
# Fix the global random seed through Keras so repeated runs start from the
# same state.
keras.utils.set_random_seed(812)

# Sequence-to-sequence translator: padded Spanish token ids go in, and one
# softmax over the English vocabulary comes out per output position.
model = Sequential([
    # Maps each Spanish token id to a 128-dimensional vector.
    Embedding(
        input_dim=es_vocab,
        output_dim=128,
        input_length=es_max_length
    ),
    LSTM(64, return_sequences=False), # encoder: returns only its final 64-unit output
    RepeatVector(en_max_length), # repeats that output once per English position
    LSTM(64, return_sequences=True, dropout=.2), # decoder: returns a 64-unit output at every position
    TimeDistributed(Dense(en_vocab, activation='softmax')) # applies the same softmax layer at each position
])




In [25]:
# Configure training (sparse categorical cross-entropy, Adam with a 1e-3
# learning rate, accuracy metric) and then print the layer summary.
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=["accuracy"],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 31, 128)           1010304   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVecto  (None, 25, 64)            0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 25, 64)            33024     
                                                                 
 time_distributed (TimeDist  (None, 25, 5053)          328445    
 ributed)                                                        
                                                                 
Total params: 1421181 (5.42 MB)
Trainable params: 142118

In [26]:
# Train for a fixed number of epochs and report the wall-clock duration
# of the fit call in minutes.
n_epochs = 35

start = time()
model.fit(es_sequences_padded, en_sequences_padded, epochs=n_epochs)
end = time()

elapsed_minutes = (end - start) / 60
print(f">>>>>>>> elapsed time: {elapsed_minutes:.2f}m")

Epoch 1/35

Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
>>>>>>>> elapsed time: 12.49m


In [52]:
# model.save("model_efc.h5")

# Predictions

In [61]:
from keras.models import load_model

# Replace the in-memory `model` with a network restored from an `.h5` file.
# NOTE(review): no visible cell writes this file (the save cell is commented
# out and names a different file) — confirm it exists before running.
PRETRAINED_MODEL_PATH = "model_seqseq2_500.h5"
model = load_model(PRETRAINED_MODEL_PATH)

In [64]:
# Row of the dataset to translate. The original cell assigned 652, 876 and 45
# before this value; those assignments were immediately overwritten and had
# no effect, so only the effective index is kept.
ii = 666

# Show the Spanish source sentence and its English reference translation.
print(es_sentences[ii])
print(en_sentences[ii])

# Prediction: run the model on a one-row batch (the ii:ii+1 slice keeps the
# batch dimension) and take the first and only result, which holds one row of
# scores per output position.
preds = model.predict(es_sequences_padded[ii:ii+1])[0]

# Pick the highest-scoring token id at every output position.
predicted_ids = np.argmax(preds, axis=1)

# Map ids back to words, skipping id 0, and join them into the final sentence.
' '.join(en_tokenizer.index_word[ww] for ww in predicted_ids if ww != 0)

He mirado por la ventana.
I've looked out the window.


"i've looked out the window"