# Red neuronal Sequence to Sequence

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import string
import pickle
from collections import Counter
from os import listdir
from os.path import isfile, join
import re
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split #particiones
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import matplotlib.pyplot as plt

# Sentinel handed to `Dataset.prefetch` further down so tf.data picks the buffer size itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

### Preparamos el dataset

Para esta red se limpia y tokeniza el corpus, pero dejamos los signos de puntuación; además, generamos pares de diálogos (cada línea junto con la línea siguiente como respuesta).

In [None]:
def prepare_dataset(paths=('../corpus/Marvel', '../corpus/Christopher Nolan')):
    """
    Build (prompt, reply) dialogue pairs from the raw script files.

    Every regular file directly inside each directory of `paths` is read,
    cleaned and split into dialogue lines; each line is then paired with the
    line that follows it in the same file. The reply is wrapped as
    '[start] <reply> [end]'.

    Args:
        paths: iterable of directories holding the corpus files. Defaults to
            the two corpus folders used by this notebook.

    Returns:
        list of (str, str) tuples: (line, '[start] ' + next_line + ' [end]').
    """
    pairs = []
    for path in paths:
        # Sorted so the order of the pairs does not depend on how the
        # filesystem happens to list the directory.
        filenames = sorted(f for f in listdir(path) if isfile(join(path, f)))

        for filename in filenames:
            with open(join(path, filename), 'rb') as f:
                corpus = f.read().decode('utf-8', 'replace')

            # Remove "#NAME?" artifacts and bracketed descriptive text.
            # The bracket match is non-greedy: with a greedy `.*` a line such
            # as "[laughs] Hello [sighs]" would also lose the dialogue that
            # sits between the two bracketed notes.
            corpus = re.sub(r"#NAME\?|\[.*?\]", "", corpus)
            # Split dialogues on line breaks and on " - " separators.
            lines = re.split(r"\n|\s-\s", corpus)
            # Trim surrounding whitespace and drop lines that end up empty.
            lines = [line.strip() for line in lines]
            lines = [line for line in lines if line]
            # Pair every line with its successor (the reply).
            pairs += [
                (prompt, '[start] ' + reply + ' [end]')
                for prompt, reply in zip(lines, lines[1:])
            ]

    return pairs

# Build every (line, reply) pair from the corpus and preview the first five.
text_pairs = prepare_dataset()
text_pairs[:5]

Dividimos en conjunto de entrenamiento, validación y prueba

In [None]:
# Hold out 30% of the pairs for testing, then 10% of the remainder for validation.
# A fixed random_state keeps the partition identical across kernel restarts, so the
# reported losses and the pickled splits below are reproducible.
aux_pairs, test_pairs = train_test_split(text_pairs, test_size=0.3, random_state=42)
train_pairs, val_pairs = train_test_split(aux_pairs, test_size=0.1, random_state=42)

print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

Guardamos y cargamos los conjuntos 

In [None]:
# Persist the three splits. The target folder is created if it is missing, and each
# file is written inside a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
os.makedirs('./pickles/seqtoseq', exist_ok=True)
with open('./pickles/seqtoseq/train_pairs.pkl', 'wb') as f:
    pickle.dump(train_pairs, f)
with open('./pickles/seqtoseq/val_pairs.pkl', 'wb') as f:
    pickle.dump(val_pairs, f)
with open('./pickles/seqtoseq/test_pairs.pkl', 'wb') as f:
    pickle.dump(test_pairs, f)

In [None]:
# Reload the splits written by the previous cell, closing every file handle.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('./pickles/seqtoseq/train_pairs.pkl', 'rb') as f:
    train_pairs = pickle.load(f)
with open('./pickles/seqtoseq/val_pairs.pkl', 'rb') as f:
    val_pairs = pickle.load(f)
with open('./pickles/seqtoseq/test_pairs.pkl', 'rb') as f:
    test_pairs = pickle.load(f)

Obtenemos el tamaño del vocabulario

In [None]:
def get_vocab_size(pairs):
    """
    Count the distinct whitespace-separated tokens across all sentences.

    Tokens are compared verbatim (case and attached punctuation are kept), so
    the '[start]' / '[end]' markers count as tokens too.

    Args:
        pairs: iterable of sentence tuples, e.g. the (line, reply) pairs.

    Returns:
        int: number of distinct tokens; 0 when `pairs` is empty.
    """
    vocabulary = set()
    for pair in pairs:
        for sentence in pair:
            # split() without arguments collapses runs of whitespace, so double
            # spaces do not add a spurious empty-string "word" to the count.
            vocabulary.update(sentence.split())
    return len(vocabulary)

# Vocabulary size over the complete set of pairs (all splits together).
get_vocab_size(text_pairs)

Creamos una función para normalizar la entrada

In [None]:
# Characters deleted during standardization: ASCII punctuation plus "¿", except the
# square brackets, which must survive so the "[start]" / "[end]" markers stay intact.
strip_chars = "".join(
    char for char in string.punctuation + "¿" if char not in "[]"
)

# Vectorization and batching hyperparameters.
vocab_size = 29948
sequence_length = 10
batch_size = 32

def custom_standardization(input_string):
    """
    Lowercase the text and delete every character listed in `strip_chars`.

    Args:
        input_string: string tensor to normalize.

    Returns:
        The lowercased string tensor with the `strip_chars` characters removed.
    """
    # Character class matching any single character of `strip_chars`.
    removal_pattern = "[%s]" % re.escape(strip_chars)
    return tf.strings.regex_replace(
        tf.strings.lower(input_string), removal_pattern, ""
    )

Como en este caso tenemos 2 redes (encoder y decoder), necesitamos 2 capas de vectorización distintas. Cada capa limpia el texto de entrada y lo convierte en un vector de enteros que lo representa; además, ambas capas se adaptan al conjunto de entrenamiento.

In [None]:
# Settings common to both vectorizers: integer token ids, fixed-length output.
shared_vectorizer_args = dict(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# The input side keeps the layer's default standardization; the reply side uses
# `custom_standardization` instead.
before_vectorization = TextVectorization(**shared_vectorizer_args)
after_vectorization = TextVectorization(
    standardize=custom_standardization, **shared_vectorizer_args
)

# Fit each vocabulary on its own half of the training pairs only.
train_before_texts = [before for before, _ in train_pairs]
train_after_texts = [after for _, after in train_pairs]
before_vectorization.adapt(train_before_texts)
after_vectorization.adapt(train_after_texts)

In [None]:
# Sanity check: vectorize two sample inputs with the input-side vectorizer.
before_vectorization([['Good God'], ['Tony Stark']])

Creamos los vectores y los bloques de datos para entrenar la red

In [None]:
def format_dataset(eng, spa):
    """
    Vectorize both halves of the dialogue pairs.

    Args:
        eng: the first dialogue line(s) of each pair.
        spa: the reply dialogue line(s) of each pair.

    Returns:
        tuple: (`before_vectorization(eng)`, `after_vectorization(spa)`).
    """
    return before_vectorization(eng), after_vectorization(spa)


def make_dataset(pairs):
    """
    Build a batched, vectorized and shuffled tf.data pipeline from text pairs.

    Args:
        pairs: non-empty sequence of (line, reply) string tuples.

    Returns:
        tf.data.Dataset yielding (vectorized_lines, vectorized_replies)
        batches of up to `batch_size` pairs.
    """
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=AUTOTUNE)
    # Order matters here:
    #  - cache() comes first so the vectorization work is stored once;
    #  - shuffle() comes after the cache so batches are re-shuffled every epoch
    #    (caching after shuffling would replay one frozen order forever);
    #  - prefetch() comes last so it overlaps input preparation with training.
    return dataset.cache().shuffle(2048).prefetch(AUTOTUNE)


# Input pipelines for the training and validation splits.
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [None]:
# Pull one batch and print its first example. `inputs` and `targets` stay defined
# afterwards and are reused by later cells to exercise the encoder and decoder.
for inputs, targets in train_ds.take(1):
    print(inputs[0], targets[0])

## Modelo

In [None]:
# Model hyperparameters, passed to both the Encoder and the Decoder constructors.
emb_dim = 256  # embedding size
model_dim = 1024  # number of GRU units

### Encoder

Generamos la primera red, la que toma la primera oración y la codifica para ser la entrada de la segunda red

In [None]:
class Encoder(tf.keras.Model):
    """
    Embedding + GRU network that encodes the first dialogue line.

    Args:
        voc_size: number of rows of the embedding table.
        emb_dim: embedding size.
        model_dim: number of GRU units.
    """

    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(voc_size, emb_dim)
        # return_sequences=False: only the last output is returned, together
        # with the final state.
        self.gru = tf.keras.layers.GRU(
            model_dim, return_sequences=False, return_state=True
        )

    def call(self, x, state=None):
        """
        Encode a batch of token ids.

        Args:
            x: batch of token ids.
            state: optional initial GRU state.

        Returns:
            tuple: (last GRU output, final GRU state).
        """
        embedded = self.embedding(x)
        last_output, final_state = self.gru(embedded, initial_state=state)
        return last_output, final_state
    
    
# Instantiate the encoder with the input-side vocabulary size and run it once on
# the `inputs` batch from the preview cell; `enc_state` is reused by the decoder cell.
encoder = Encoder(before_vectorization.vocabulary_size(),
                  emb_dim, model_dim)
output, enc_state = encoder(inputs)
enc_state

In [None]:
# Layer-by-layer summary of the encoder.
encoder.summary()

In [None]:
# Shape of the first-column slice of `targets`, the same slice fed to the decoder below.
targets[:, :1].shape

### Decoder

Generamos la segunda red, la que toma el estado final del encoder y genera, token a token, el vector que pertenece al diálogo de respuesta

In [None]:
class Decoder(tf.keras.Model):
    """
    Embedding + GRU + Dense network that predicts the reply tokens.

    Args:
        voc_size: number of rows of the embedding table and number of output logits.
        emb_dim: embedding size.
        model_dim: number of GRU units.
    """

    def __init__(self, voc_size, emb_dim, model_dim):
        # Call the base constructor without arguments: `super()` already binds
        # `self`, so passing it again hands Keras a stray positional argument
        # (ignored by some versions, rejected by others).
        super().__init__()
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, x, state, training=False):
        """
        Run the decoder on a batch of token ids.

        Args:
            x: batch of token ids.
            state: initial GRU state.
            training: forwarded to every sub-layer.

        Returns:
            tuple: (logits produced by the Dense layer, final GRU state).
        """
        x = self.embedding(x, training=training)
        x, state = self.gru(x, initial_state=state, training=training)
        x = self.logits(x, training=training)

        return x, state


# Instantiate the decoder with the reply-side vocabulary size and run it once on the
# first target column, starting from the encoder state computed earlier.
decoder = Decoder(voc_size=after_vectorization.vocabulary_size(),
                  emb_dim=emb_dim,
                  model_dim=model_dim)

decoder(targets[:, :1], enc_state)

In [None]:
# Layer-by-layer summary of the decoder.
decoder.summary()

## Entrenamiento

In [None]:
# Adam optimizer constructed with 0.001 as its first argument.
opt = tf.keras.optimizers.Adam(0.001)
# Un-reduced cross-entropy on raw logits; `loss_function` does the masking and averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """
    Mean cross-entropy in which positions whose target id is 0 contribute zero.

    Args:
        real: integer target ids (0 is treated as padding).
        pred: logits whose last axis is the vocabulary; the leading axes must
            hold as many positions as `real` has elements.

    Returns:
        Scalar tensor: the masked per-position losses averaged over ALL
        positions, padded ones included.
    """
    # Flatten both tensors so labels, per-position losses and the mask share
    # one shape. The decoder emits logits with an extra length-1 time axis;
    # without this step the un-reduced loss can come back as (batch, 1), and
    # multiplying it by a (batch,) mask broadcasts to (batch, batch) instead
    # of masking each example.
    real = tf.reshape(real, [-1])
    pred = tf.reshape(pred, [-1, tf.shape(pred)[-1]])

    loss_ = loss_object(real, pred)

    mask = tf.cast(tf.math.not_equal(real, 0), dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [None]:
# Encode the preview batch again and display only the returned state.
_, state = encoder(inputs)
state

In [None]:
# Running means of the per-batch losses; updated by train_step / test_step and reset
# once per epoch in the training loop.
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [None]:
# Fetch one batch and print its first three input / target rows.
for inputs, targets in train_ds.take(1):
    print(inputs[:3], targets[:3])

In [None]:
# Column at index 1 of the current `inputs` batch.
inputs[:, 1]

In [None]:
@tf.function
def train_step(inp_batch, tar_batch):
    """
    Run one teacher-forced optimization step and record the batch loss.

    The encoder state seeds the decoder; at each position the decoder gets the
    ground-truth token and is scored against the next ground-truth token.

    Args:
        inp_batch: vectorized input lines.
        tar_batch: vectorized reply lines; axis 1 is the token position.

    Returns:
        None. Updates the encoder/decoder weights through `opt` and feeds the
        batch loss to `train_loss_avg`.
    """
    target_len = tar_batch.shape[1]
    running_loss = tf.constant(0.0)

    with tf.GradientTape() as tape:
        _, hidden = encoder(inp_batch, training=True)

        for t in range(target_len - 1):
            # Ground-truth token at position t, with a length-1 time axis added.
            current_tokens = tf.expand_dims(tar_batch[:, t], 1)
            logits, hidden = decoder(current_tokens, hidden, training=True)
            running_loss += loss_function(tar_batch[:, t + 1], logits)
        # Summed over target_len - 1 steps but divided by target_len.
        total_loss = running_loss / target_len

    trainable = encoder.trainable_weights + decoder.trainable_weights
    grads = tape.gradient(total_loss, trainable)
    opt.apply_gradients(zip(grads, trainable))
    train_loss_avg(total_loss)

@tf.function
def test_step(inp_batch, tar_batch):
    """
    Compute the teacher-forced loss on a validation batch without training.

    Mirrors `train_step` (same per-position loss, same normalization) so the
    two curves are comparable. Both networks run in inference mode and no
    gradient tape is opened, since nothing is differentiated here.

    Args:
        inp_batch: vectorized input lines.
        tar_batch: vectorized reply lines; axis 1 is the token position.

    Returns:
        None. Feeds the batch loss to `val_loss_avg`.
    """
    loss = tf.constant(0.0)

    _, state = encoder(inp_batch, training=False)

    for step in range(0, tar_batch.shape[1] - 1):
        # Ground-truth token at this position, with a length-1 time axis added.
        dec_inp = tf.expand_dims(tar_batch[:, step], 1)
        pred, state = decoder(dec_inp, state, training=False)
        loss += loss_function(tar_batch[:, step + 1], pred)
    total_loss = loss / tar_batch.shape[1]

    val_loss_avg(total_loss)

Configuramos los checkpoints para guardar los pesos del encoder y del decoder

In [None]:
# One checkpoint + manager per network, each writing to its own directory.
# max_to_keep=1 is passed so that only the most recent checkpoint is retained.
ckpt_encoder = tf.train.Checkpoint(encoder)
ckpt_encoder_manager = tf.train.CheckpointManager(
    ckpt_encoder, 
    directory="./seqtoseqModelCheckpoint/encoder/", 
    max_to_keep=1
)

ckpt_decoder = tf.train.Checkpoint(decoder)
ckpt_decoder_manager = tf.train.CheckpointManager(
    ckpt_decoder, 
    directory="./seqtoseqModelCheckpoint/decoder/", 
    max_to_keep=1
)

In [None]:
# Number of passes over the training set.
epochs = 1

# Per-epoch mean losses, appended by the training loop and plotted afterwards.
train_loss_history = []
val_loss_history = []

Entrenamos

In [None]:
for epoch in range(epochs):
    # Training pass over every batch.
    for text, target in train_ds:
        train_step(text, target)

    train_loss = train_loss_avg.result().numpy()
    print(f'Epoch: {epoch} Train loss: {train_loss}')
    train_loss_history.append(train_loss)
    # reset_state() replaces the deprecated reset_states() spelling.
    train_loss_avg.reset_state()
    # Save both networks, numbering the checkpoint with the epoch index.
    ckpt_encoder_manager.save(checkpoint_number=epoch)
    ckpt_decoder_manager.save(checkpoint_number=epoch)

    # Validation pass (no weight updates).
    for text, target in val_ds:
        test_step(text, target)

    val_loss = val_loss_avg.result().numpy()
    print(f'Val loss: {val_loss}')
    val_loss_history.append(val_loss)
    val_loss_avg.reset_state()



In [None]:
# Plot the per-epoch training and validation loss on one set of axes, save the
# figure to disk and display it.
fig, ax = plt.subplots()
ax.plot(train_loss_history)
ax.plot(val_loss_history)
ax.set_title('train and validation loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
fig.savefig('training_validation_loss.png')
plt.show()

## Probar la red

In [None]:
# Map each token id (its position in the reply-side vocabulary) back to its token text.
after_vocab = after_vectorization.get_vocabulary()
after_index_lookup = dict(enumerate(after_vocab))

In [None]:
# Greedy decoding of a reply to one sample sentence.
inp = before_vectorization(['i love my dog'])
_, state = encoder(inp)
# Seed the decoder with the id of the '[start]' marker.
dec_inp = after_vectorization(['[start]'])[:, :1]
output = []
pred_index = ''

# Bound the loop: an undertrained model may never emit '[end]', which would make an
# unbounded `while` spin forever. Training replies are at most `sequence_length`
# tokens long, so that is used as the cap.
for _step in range(sequence_length):
    pred, state = decoder(dec_inp, state, training=False)
    # The most likely token becomes the next decoder input.
    dec_inp = tf.argmax(pred, axis=-1)
    pred_index = after_index_lookup[dec_inp[0][0].numpy()]
    if pred_index == '[end]':
        break
    output.append(pred_index)

' '.join(output)