# In [1]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
# This file defines the helper functions shared by the different models.

#---Imports----
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers as layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
import random
from unicodedata import normalize
import string
import os.path


#---Parameter definitions---
# Mini-batch size passed to model.fit by the train_* functions.
batch_size = 128
# Number of epochs passed to model.fit by the train_* functions.
epochs = 5
# Number of sentence pairs sampled from the raw corpus (read_datasets) and
# the total used to size the train/test split (split_dataset).
num_sentences = 1000
# Embedding output dimension in the embedding models; also the LSTM unit count.
latent_dim = 128
# Number of units in the GRU layers.
hidden_units = 128
# Checkpoint files written by ModelCheckpoint in the corresponding train_* function.
model_path_lstm = "lstm_model.h5"
model_path_gru_embedding = "gru_embedding.h5"
model_path_gru_inference = "gru_inference.h5"
# Raw parallel corpus files (Spanish / English), read by read_datasets.
es_dataset = "../dataset/es-en/europarl-v7.es-en.es"
en_dataset = "../dataset/es-en/europarl-v7.es-en.en"
# Cached, cleaned train/test splits read or written by init_data.
es_tr_dataset = "../dataset/es-en/es_tr_dataset.es"
es_test_dataset = "../dataset/es-en/es_test_dataset.es"
en_tr_dataset = "../dataset/es-en/en_tr_dataset.en"
en_test_dataset = "../dataset/es-en/en_test_dataset.en"
# Passed as num_words to the English / Spanish Tokenizer in init_tokens.
en_vocab_max = 100
es_vocab_max = 100
# Passed as oov_token to both Tokenizers in init_tokens.
oov = "Unkn"
# Fraction of num_sentences assigned to the training split in split_dataset.
perc_train = 0.8

#---Definicion de funciones---

#-------Leer datasets---------
def cleantexts(texts):
    """Normalize raw sentences before tokenization.

    Each line is decomposed with Unicode NFD and encoded to ASCII with
    errors ignored (dropping accents and every other non-ASCII character),
    lower-cased, and stripped of all characters in ``string.punctuation``.
    Whitespace, including any trailing newline, is left untouched. No
    start/end-of-sentence tokens are added.

    Args:
        texts: iterable of ``str`` lines.

    Returns:
        A new list with one cleaned string per input line, in input order.
    """
    # One translation table is enough for every line.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def _clean(raw):
        ascii_only = normalize("NFD", raw).encode("ascii", "ignore").decode("UTF-8")
        return ascii_only.lower().translate(strip_punctuation)

    return [_clean(raw) for raw in texts]

def read_datasets():
    """Load a random sample of aligned sentence pairs from the raw corpora.

    Reads every line of ``en_dataset`` and ``es_dataset``, draws
    ``num_sentences`` distinct line positions with ``random.sample`` (over
    the length of the Spanish file) and returns the lines found at those
    positions in both files, so the i-th Spanish line stays paired with the
    i-th English line.

    Returns:
        Tuple ``(es_texts, en_texts)`` of two equally long lists of raw
        lines (trailing newlines included, as produced by ``readlines``).

    Raises:
        ValueError: from ``random.sample`` when the Spanish file has fewer
            than ``num_sentences`` lines.
        IndexError: when the English file has no line at a sampled position.
    """
    # Context managers close both files even if reading fails. The encoding
    # is explicit so reading does not depend on the platform default
    # (the Europarl corpus is assumed to be UTF-8 -- confirm for other data).
    with open(en_dataset, "r", encoding="utf-8") as f_en, \
            open(es_dataset, "r", encoding="utf-8") as f_es:
        es_texts = f_es.readlines()
        en_texts = f_en.readlines()

    # The same positions are used for both languages to keep pairs aligned.
    index = random.sample(range(len(es_texts)), num_sentences)
    en_texts = [en_texts[ind] for ind in index]
    es_texts = [es_texts[ind] for ind in index]
    return (es_texts, en_texts)

def init_data():
    """Return the train/test sentence splits, creating and caching them if needed.

    When all four cache files (``es_tr_dataset``, ``en_tr_dataset``,
    ``es_test_dataset``, ``en_test_dataset``) exist, their lines are read
    back. Otherwise a fresh sample is drawn with ``read_datasets``, cleaned
    with ``cleantexts``, split with ``split_dataset`` and written to those
    four files.

    NOTE: cache files written by earlier versions of this function may hold
    the two languages swapped; delete them to regenerate.

    Returns:
        Tuple ``(es_train, en_train, es_test, en_test, es_texts, en_texts)``.
        ``es_texts`` / ``en_texts`` are new lists containing the training
        lines followed by the test lines of each language.
    """
    # Order matches the (es_train, en_train, es_test, en_test) unpacking below.
    cache_paths = (es_tr_dataset, en_tr_dataset, es_test_dataset, en_test_dataset)

    if all(os.path.isfile(path) for path in cache_paths):
        splits = []
        for path in cache_paths:
            with open(path, "r", encoding="utf-8") as handle:
                splits.append(handle.readlines())
        es_train, en_train, es_test, en_test = splits
    else:
        es_texts, en_texts = read_datasets()
        es_texts = cleantexts(es_texts)
        en_texts = cleantexts(en_texts)
        # split_dataset takes (en_texts, es_texts); keyword arguments avoid
        # the positional mix-up that used to swap the two languages.
        es_train, es_test, en_train, en_test = split_dataset(
            en_texts=en_texts, es_texts=es_texts
        )
        for path, lines in zip(cache_paths, (es_train, en_train, es_test, en_test)):
            # "with" guarantees the data is flushed and the handle closed.
            with open(path, "w", encoding="utf-8") as handle:
                handle.writelines(lines)

    # Concatenate into NEW lists: extending es_train/en_train in place would
    # leak the test sentences into the returned training sets.
    es_texts = es_train + es_test
    en_texts = en_train + en_test

    return (es_train, en_train, es_test, en_test, es_texts, en_texts)

def split_dataset(en_texts, es_texts):
    """Randomly split two aligned sentence lists into train and test parts.

    The split sizes come from the module-level ``num_sentences`` and
    ``perc_train``; positions are shuffled with ``np.random.shuffle`` over
    ``len(es_texts)`` and the same positions are applied to both lists.

    Args:
        en_texts: English sentences (first parameter).
        es_texts: Spanish sentences, aligned with ``en_texts``.

    Returns:
        Tuple ``(es_train, es_test, en_train, en_test)`` -- note that the
        Spanish lists come first although English is the first parameter.
    """
    n_train = int(num_sentences * perc_train)
    n_test = int(num_sentences - n_train)

    order = np.arange(len(es_texts))
    np.random.shuffle(order)
    train_ids = order[:n_train]
    test_ids = order[n_train:n_train + n_test]

    def _take(texts, ids):
        # Plain Python list of the selected sentences.
        return [texts[i] for i in ids]

    en_train, en_test = _take(en_texts, train_ids), _take(en_texts, test_ids)
    es_train, es_test = _take(es_texts, train_ids), _take(es_texts, test_ids)
    return (es_train, es_test, en_train, en_test)


#-------Tokenizar---------

def init_tokens(en_texts, es_texts):
    """Fit one Keras ``Tokenizer`` per language and measure the corpora.

    Args:
        en_texts: English sentences.
        es_texts: Spanish sentences.

    Returns:
        Tuple ``(es_tok, en_tok, es_len, en_len, es_vocab, en_vocab)``:
        the fitted tokenizers, the largest whitespace-separated word count
        of any sentence per language, and ``len(word_index) + 1`` per
        language. The vocabulary sizes are taken from the full
        ``word_index`` and are therefore not capped by ``en_vocab_max`` /
        ``es_vocab_max`` (presumably ``num_words`` only limits
        ``texts_to_sequences`` -- see the Keras Tokenizer docs).

    Raises:
        ValueError: from ``max`` when either list is empty.
    """
    def _fitted(texts, max_words):
        tok = Tokenizer(num_words=max_words, oov_token=oov)
        tok.fit_on_texts(texts)
        return tok

    def _longest(texts):
        return max(len(sentence.split()) for sentence in texts)

    en_tok = _fitted(en_texts, en_vocab_max)
    es_tok = _fitted(es_texts, es_vocab_max)
    en_len, es_len = _longest(en_texts), _longest(es_texts)
    en_vocab = len(en_tok.word_index) + 1
    es_vocab = len(es_tok.word_index) + 1
    return (es_tok, en_tok, es_len, en_len, es_vocab, en_vocab)


#-------Crear modelos-------

#MODELO GRU CON INFERENCIA
def create_model_inference_train(es_len, en_len, es_vocab, en_vocab):
    """Build the GRU encoder-decoder that consumes one-hot sequences.

    Args:
        es_len: number of time steps of the encoder input.
        en_len: decoder sequence length; the decoder input has
            ``en_len - 1`` time steps.
        es_vocab: size of the last axis of the encoder input.
        en_vocab: size of the last axis of the decoder input and of the
            softmax output.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_input, decoder_input]``
        and producing one softmax distribution over ``en_vocab`` per
        decoder time step.
    """
    # ----- Encoder: only its final state is used downstream -----
    encoder_input = layers.Input(shape=(es_len, es_vocab))
    encoder_gru = layers.GRU(hidden_units, return_state=True)
    _, encoder_state = encoder_gru(encoder_input)

    # ----- Decoder: starts from the encoder state, emits every step -----
    decoder_input = layers.Input(shape=(en_len - 1, en_vocab))
    decoder_gru = layers.GRU(hidden_units, return_sequences=True)
    decoder_steps = decoder_gru(decoder_input, initial_state=encoder_state)

    # The same Dense softmax is applied to each decoder time step.
    per_step_softmax = layers.TimeDistributed(
        layers.Dense(en_vocab, activation="softmax")
    )
    prediction = per_step_softmax(decoder_steps)

    return Model(inputs=[encoder_input, decoder_input], outputs=prediction)

#MODELO GRU CON EMBEDDING
def create_model_gru_embedding(es_len, en_len, es_vocab, en_vocab):
    """Build the GRU encoder-decoder with an Embedding layer on each side.

    Both inputs are 2-D (batch, time) sequences fed to ``Embedding`` layers
    (i.e. token ids rather than one-hot vectors).

    Args:
        es_len: encoder input length.
        en_len: decoder sequence length; the decoder input has
            ``en_len - 1`` positions.
        es_vocab: input dimension of the encoder embedding.
        en_vocab: input dimension of the decoder embedding and size of the
            softmax output.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_ids, decoder_ids]`` and
        producing a softmax over ``en_vocab`` for each decoder position.
    """
    # ----- Encoder (batch size is left unspecified) -----
    encoder_ids = layers.Input(shape=(es_len,))
    encoder_embedding = layers.Embedding(es_vocab, latent_dim, input_length=es_len)
    encoder_vectors = encoder_embedding(encoder_ids)
    encoder_gru = layers.GRU(hidden_units, return_state=True)
    _, encoder_state = encoder_gru(encoder_vectors)

    # ----- Decoder, initialised with the encoder's final state -----
    decoder_ids = layers.Input(shape=(en_len - 1,))
    decoder_embedding = layers.Embedding(en_vocab, latent_dim, input_length=en_len - 1)
    decoder_vectors = decoder_embedding(decoder_ids)
    decoder_gru = layers.GRU(hidden_units, return_state=True, return_sequences=True)
    decoder_steps, _ = decoder_gru(decoder_vectors, initial_state=encoder_state)

    # Dense softmax shared across all decoder positions.
    per_step_softmax = layers.TimeDistributed(
        layers.Dense(en_vocab, activation="softmax")
    )
    prediction = per_step_softmax(decoder_steps)

    return Model([encoder_ids, decoder_ids], prediction)

#MODELO LSTM CON EMBEDDING
def create_model_lstm(es_len, en_len, es_vocab, en_vocab):
    """Build the LSTM encoder-decoder with embeddings and variable-length inputs.

    Both ``Input`` layers are declared with ``shape=(None,)``, so the
    sequence length is not fixed by the model.

    Args:
        es_len: only forwarded as ``input_length`` to the encoder embedding.
        en_len: not used by this builder; kept for a signature parallel to
            the other ``create_model_*`` functions.
        es_vocab: input dimension of the encoder embedding.
        en_vocab: input dimension of the decoder embedding and size of the
            softmax output.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_ids, decoder_ids]`` and
        producing a softmax over ``en_vocab`` for each decoder position.
    """
    # --- Encoder: keep only the final hidden and cell states ---
    encoder_ids = layers.Input(shape=(None,))
    encoder_vectors = layers.Embedding(
        es_vocab, latent_dim, input_length=es_len
    )(encoder_ids)
    _, state_h, state_c = layers.LSTM(latent_dim, return_state=True)(encoder_vectors)

    # --- Decoder: seeded with the encoder states, emits every step ---
    decoder_ids = layers.Input(shape=(None,))
    decoder_vectors = layers.Embedding(en_vocab, latent_dim)(decoder_ids)
    decoder_steps = layers.LSTM(latent_dim, return_sequences=True)(
        decoder_vectors, initial_state=[state_h, state_c]
    )
    prediction = layers.TimeDistributed(
        layers.Dense(en_vocab, activation="softmax")
    )(decoder_steps)

    return Model([encoder_ids, decoder_ids], prediction)
    

#----Entrenar Modelo----
def train_inference_model():
    """Train the one-hot GRU model, checkpointing the best validation loss.

    Reads names that are not defined in the visible part of this file and
    must exist as module-level globals before the call: ``model``,
    ``es_tok``, ``en_tok``, ``es_train``, ``en_train``, ``es_test``,
    ``en_test``, ``es_len``, ``en_len``, ``es_vocab`` and ``en_vocab``.

    The test split is used as validation data, and ``ModelCheckpoint``
    writes the model to ``model_path_gru_inference`` with
    ``save_best_only=True`` on ``val_loss``. Returns ``None``.
    """
    def _encode(es_sentences, en_sentences):
        """Return (encoder_x, decoder_x, decoder_y) as one-hot arrays."""
        es_x = sentences_to_sequences(es_tok, es_sentences, es_len, es_vocab)
        en_xy = sentences_to_sequences(
            en_tok, en_sentences, en_len, en_vocab, reverse=False
        )
        # Decoder input drops the last step, the target drops the first,
        # so each input step is paired with the following token.
        return es_x, en_xy[:, :-1, :], en_xy[:, 1:, :]

    tr_es_x, tr_en_x, tr_en_y = _encode(es_train, en_train)
    test_es_x, test_en_x, test_en_y = _encode(es_test, en_test)

    checkpoint = ModelCheckpoint(
        model_path_gru_inference, monitor="val_loss", save_best_only=True
    )
    model.fit(
        [tr_es_x, tr_en_x],
        tr_en_y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([test_es_x, test_en_x], test_en_y),
        callbacks=[checkpoint],
    )

def train_gru_embedding_model():
    """Train the GRU+Embedding model, checkpointing the best validation loss.

    Reads names that are not defined in the visible part of this file and
    must exist as module-level globals before the call: ``model``,
    ``es_tok``, ``en_tok``, ``es_train``, ``en_train``, ``es_test``,
    ``en_test``, ``es_len``, ``en_len``, ``es_vocab`` and ``en_vocab``.

    Inputs are id sequences (``onehot=False``); targets are one-hot. The
    test split is used as validation data, and ``ModelCheckpoint`` writes
    the model to ``model_path_gru_embedding`` with ``save_best_only=True``
    on ``val_loss``. Returns ``None``.
    """
    def _encode(es_sentences, en_sentences):
        """Return (encoder_ids, decoder_ids, decoder_onehot_targets)."""
        es_x = sentences_to_sequences(
            es_tok, es_sentences, es_len, es_vocab, onehot=False
        )
        en_ids = sentences_to_sequences(
            en_tok, en_sentences, en_len, en_vocab, onehot=False, reverse=False
        )
        # The target must be one-hot to match the softmax output.
        en_onehot = sentences_to_sequences(
            en_tok, en_sentences, en_len, en_vocab, reverse=False
        )
        # Decoder input drops the last position, the target drops the first.
        return es_x, en_ids[:, :-1], en_onehot[:, 1:, :]

    tr_es_x, tr_en_x, tr_en_y = _encode(es_train, en_train)
    test_es_x, test_en_x, test_en_y = _encode(es_test, en_test)

    checkpoint = ModelCheckpoint(
        model_path_gru_embedding, monitor="val_loss", save_best_only=True
    )
    model.fit(
        [tr_es_x, tr_en_x],
        tr_en_y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([test_es_x, test_en_x], test_en_y),
        callbacks=[checkpoint],
    )
            
def train_lstm_model():
    """Train the LSTM+Embedding model, checkpointing the best validation loss.

    Reads names that are not defined in the visible part of this file and
    must exist as module-level globals before the call: ``model``,
    ``es_tok``, ``en_tok``, ``es_train``, ``en_train``, ``es_test``,
    ``en_test``, ``es_len``, ``en_len``, ``es_vocab`` and ``en_vocab``.

    Inputs are id sequences (``onehot=False``); targets are one-hot. The
    test split is used as validation data, and ``ModelCheckpoint`` writes
    the model to ``model_path_lstm`` with ``save_best_only=True`` on
    ``val_loss``. Returns ``None``.
    """
    def _encode(es_sentences, en_sentences):
        """Return (encoder_ids, decoder_ids, decoder_onehot_targets)."""
        es_x = sentences_to_sequences(
            es_tok, es_sentences, es_len, es_vocab, onehot=False
        )
        en_ids = sentences_to_sequences(
            en_tok, en_sentences, en_len, en_vocab, onehot=False, reverse=False
        )
        en_onehot = sentences_to_sequences(
            en_tok, en_sentences, en_len, en_vocab, reverse=False
        )
        # Decoder input drops the last position, the target drops the first.
        return es_x, en_ids[:, :-1], en_onehot[:, 1:, :]

    tr_es_x, tr_en_x, tr_en_y = _encode(es_train, en_train)
    test_es_x, test_en_x, test_en_y = _encode(es_test, en_test)

    checkpoint = ModelCheckpoint(model_path_lstm, monitor="val_loss", save_best_only=True)
    model.fit(
        [tr_es_x, tr_en_x],
        tr_en_y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([test_es_x, test_en_x], test_en_y),
        callbacks=[checkpoint],
    )
    
    
#---------Utils---------

#Converts sentences into sequences (vectors of the word ids of each sentence).
#The sequences are padded and, if requested, turned into one-hot vectors with
#the to_categorical function provided by Keras.
#Options:
# - choose the padding type, "pre" or "post"
# - choose whether the output is one-hot or not
# - choose whether each sequence is reversed
def sentences_to_sequences(tokenizer,sentence,length,vocab_size, onehot=True, pad_type = "post", reverse = True):
    """Turn sentences into padded id sequences, optionally reversed and one-hot.

    Args:
        tokenizer: object exposing ``texts_to_sequences`` (a fitted Keras
            ``Tokenizer`` in this file).
        sentence: the sentences to convert, passed to ``texts_to_sequences``.
        length: ``maxlen`` for ``pad_sequences``; longer sequences are
            truncated at the end (``truncating="post"``).
        vocab_size: ``num_classes`` used when one-hot encoding.
        onehot: when true, apply ``to_categorical`` to the result.
        pad_type: ``padding`` argument for ``pad_sequences``.
        reverse: when true, reverse every row along the time axis; this
            happens after padding, so the padding is reversed as well.

    Returns:
        The padded (and possibly reversed) id array, or its one-hot
        encoding when ``onehot`` is true.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences(sentence),
        padding=pad_type,
        truncating="post",
        maxlen=length,
    )
    ordered = padded[:, ::-1] if reverse else padded
    if not onehot:
        return ordered
    return to_categorical(ordered, num_classes=vocab_size)

    