# Proyecto final

In [None]:
import os
import numpy as np
from os import listdir
from os.path import isfile, join
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers
import pickle
from sklearn.model_selection import train_test_split #particiones

# Shorthand for tf.data's autotune sentinel, intended for calls such as Dataset.prefetch.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
def get_txt(path, encoding="utf-8"):
    """
    Return the lines of every regular file found directly inside a directory.

    Files are visited in sorted name order so the result is reproducible
    across runs and platforms (``os.listdir`` returns entries in arbitrary
    order). Sub-directories are skipped. Lines keep their trailing newline.

    Args:
        path (str): directory to read.
        encoding (str): text encoding used to decode each file.

    Returns:
        list[str]: all lines of all files, concatenated file by file.
    """
    text = []
    for name in sorted(listdir(path)):
        full_path = join(path, name)
        if not isfile(full_path):
            continue
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(full_path, 'r', encoding=encoding) as f:
            text.extend(f.readlines())
    return text

# Read every script into one flat list: each element of `corpus` is a single
# line of text, and the three collections are concatenated in this order.
corpus = []
corpus += get_txt("../corpus/Pride & Prejudice")
corpus += get_txt("../corpus/Marvel")
corpus += get_txt("../corpus/Christopher Nolan")
# Preview only a few lines; echoing the whole corpus floods the cell output.
corpus[:5]

In [None]:
# Total number of text lines collected in `corpus`.
len(corpus)

In [None]:
# Find the longest line of the corpus.

def get_longest_sentence(corpus):
    """
    Return the longest string in ``corpus``, measured in characters.

    When several strings share the maximum length the first one wins.

    Args:
        corpus (list[str]): lines of text.

    Returns:
        str: the longest line, or ``''`` when ``corpus`` is empty.
    """
    return max(corpus, key=len, default='')

largest = get_longest_sentence(corpus)
print(largest)
# split() with no argument collapses runs of whitespace and ignores the
# trailing newline, so this is the actual word count of that line.
print(len(largest.split()))

In [None]:
# Train/test partition (70 % / 30 %), seeded so the split is reproducible.
train, test = train_test_split(corpus, test_size=0.3, random_state=42)
print('Número de cadenas train:',len(train))
print('Número de cadenas test:',len(test))

# Persist both partitions. The directory is created on demand and `with`
# guarantees each file is flushed and closed.
os.makedirs('./pickles/datasets', exist_ok=True)
with open('./pickles/datasets/train.pkl', 'wb') as f:
    pickle.dump(train, f)
with open('./pickles/datasets/test.pkl', 'wb') as f:
    pickle.dump(test, f)

In [None]:
# Reload the partitions from disk. Only unpickle files you produced yourself:
# pickle can execute arbitrary code while loading.
with open('./pickles/datasets/train.pkl', 'rb') as f:
    train = pickle.load(f)
with open('./pickles/datasets/test.pkl', 'rb') as f:
    test = pickle.load(f)

print('Número de cadenas train:',len(train))
print('Número de cadenas test:',len(test))
print(train[:3])

In [None]:
# Join all training lines into one string and tokenize it in a single call;
# .to_list()[0] picks the token list of that single input string.
tokenizer = tf_text.UnicodeScriptTokenizer()
movies_tokens =  tokenizer.tokenize([' '.join(train)]).to_list()[0]
# Peek at the first ten tokens.
movies_tokens[:10]

In [None]:
# Dataset built by slicing the token list, i.e. one token per element.
words_ds = tf.data.Dataset.from_tensor_slices(movies_tokens)

In [None]:
# Print the first 20 elements of the token dataset.
for words in words_ds.take(20):
    print(words.numpy())

In [None]:
# Window length; each chunk built here holds seq_length + 1 consecutive tokens.
seq_length = 21
# Group the token stream into fixed-size chunks, discarding the final short one.
words_batches = words_ds.batch(seq_length+1, 
                               drop_remainder=True)

# Show the first chunk.
for words in words_batches.take(1):
    print(words.numpy())

In [None]:
# Display the chunked token dataset object.
words_batches

In [None]:
def join_strings(tokens):
    """Join the string tokens along axis 0 into one string, separated by single spaces."""
    joined = tf.strings.reduce_join(tokens, separator=' ', axis=0)
    return joined

In [None]:
# Display the chunked token dataset object (same inspection as the earlier identical cell).
words_batches

In [None]:
# Turn each chunk of tokens back into one space-joined string.
raw_train_ds = words_batches.map(join_strings)

In [None]:
# Display the dataset object.
raw_train_ds

In [None]:
# NOTE(review): this rebinds `raw_train_ds`, discarding the dataset of joined
# token windows assigned to the same name in an earlier cell; from here on the
# dataset holds the raw corpus lines instead. Confirm that is intended.
# A list of Python strings converts to a string tensor directly, so there is
# no need for a NumPy round-trip (which allocates a fixed-width unicode array
# sized by the longest line).
raw_train_ds = tf.data.Dataset.from_tensor_slices(train)
batch_size = 32
BUFFER_SIZE = len(train)  # shuffle buffer spans the whole dataset

raw_train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(AUTOTUNE))


In [None]:
# Display the batched dataset object.
raw_train_ds

In [None]:
# Preview a few training lines; echoing the full list floods the cell output.
train[:5]

In [None]:
# Print one batch taken from the dataset.
for batch in raw_train_ds.take(1):
    print(batch)

In [None]:
# Upper bound on the vocabulary size handed to TextVectorization; the actual
# size is measured after adapt() and stored in `voc_size`.
MAX_VOC_SIZE = 20406

def clean_text(raw_text):
    """
    Standardize raw text before tokenization.

    A TextVectorization ``standardize`` callable must return a string tensor,
    so the text is re-encoded UTF-8 -> UTF-8 with invalid byte sequences
    dropped (``errors='ignore'``) instead of being decoded to integer code
    points. Case and punctuation are left untouched.

    Args:
        raw_text: string tensor of input lines.

    Returns:
        String tensor of the same shape containing only valid UTF-8.
    """
    return tf.strings.unicode_transcode(
        raw_text,
        input_encoding='UTF-8',
        output_encoding='UTF-8',
        errors='ignore')


vectorize_layer = layers.TextVectorization(
    standardize=clean_text,
    max_tokens=MAX_VOC_SIZE,
    output_mode='int',
    # Every example is padded/truncated to this many token ids.
    output_sequence_length=21,
)

# The dataset is already batched, so no batch size is passed to adapt().
vectorize_layer.adapt(raw_train_ds)
vocab = vectorize_layer.get_vocabulary()
voc_size = len(vocab)
voc_size

In [None]:
# Preview a few training lines; echoing the full list floods the cell output.
train[:5]

In [None]:
# Quick check: vectorize two sample strings and look at the resulting ids.
vectorize_layer(['Love you', '3 millions'])

In [None]:
def get_input_target(text):
    """
    Vectorize a batch of strings and build a next-token training pair.

    Returns:
        tuple: ``(inputs, targets)`` where ``inputs`` drops the last position
        of every vectorized sequence and ``targets`` drops the first, so
        ``targets[:, t]`` is the token that follows ``inputs[:, t]``.
    """
    token_ids = vectorize_layer(text)
    return token_ids[:, :-1], token_ids[:, 1:]

In [None]:
# Map every batch of raw strings to an (input, target) pair of token-id tensors.
train_ds = raw_train_ds.map(get_input_target)

In [None]:
# Inspect one batch: the two shapes, then the first input/target pair.
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])

Definir modelo

In [None]:
# Size of each token embedding vector (passed to the Embedding layer).
emb_dim = 256
# Number of units of the recurrent layer (passed to the GRU).
model_dim = 1024

In [None]:
class RNN(tf.keras.Model):
    """
    Next-token language model: Embedding -> GRU -> Dense logits.

    Args:
        voc_size (int): number of token ids; sizes both the embedding table
            and the output layer.
        emb_dim (int): size of each embedding vector.
        model_dim (int): number of GRU units.
    """

    def __init__(self, voc_size, emb_dim, model_dim):
        # No positional arguments: the instance must not be handed to the
        # base constructor a second time.
        super().__init__()
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        # No activation: the layer emits raw logits over the vocabulary.
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """
        Run the model on a batch of token ids.

        Args:
            inputs: integer tensor of token ids.
            states: optional GRU state to resume from; ``None`` starts from
                the GRU's default initial state.
            return_state (bool): when True, also return the final GRU state.
            training (bool): forwarded to the layers.

        Returns:
            The per-position logits, or ``(logits, states)`` when
            ``return_state`` is True.
        """
        x = self.embedding(inputs, training=training)
        # initial_state=None lets the GRU create its own initial state, so no
        # explicit get_initial_state() call is needed.
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        return x

# Instantiate the network with the vocabulary size and the two dimensions defined above.
model = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

In [None]:
# Forward pass on one batch to check output shapes. The model is fed the
# inputs (not the targets), which is what the decoding cells further down
# compare the sampled predictions against.
for input_batch, target_batch in train_ds.take(1):
    predictions = model(input_batch)
    print(predictions.shape, target_batch.shape)

In [None]:
# Print the layer and parameter summary of the model.
model.summary()

In [None]:
# Shape of the logits for the first sequence of the batch.
predictions[0].shape

In [None]:
# Sample one token id per row of the first sequence's logits.
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
# Drop the num_samples axis to get a flat vector of sampled ids.
pred_indices[:, 0]

Obtener palabras a través de índices con vocab

In [None]:
# Decode the first input sequence of the batch back into words.
' '.join(vocab[token_id] for token_id in input_batch[0])

In [None]:
# Decode the sampled token ids back into words.
' '.join(vocab[token_id] for token_id in pred_indices[:, 0])

# Entrenamiento

In [None]:
# Cross-entropy over integer targets; from_logits=True because the model's
# Dense output layer is created without an activation.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
# Adam optimizer with a fixed learning rate.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
# Running mean of the per-batch loss values.
loss_metric = tf.keras.metrics.Mean(name='loss')

In [None]:
@tf.function
def train_step(input_batch, target_batch):
    """
    Run one optimization step on a single batch.

    Computes the loss of the model's logits against ``target_batch``, applies
    the gradients to the model's trainable weights with ``opt`` and records
    the batch loss in ``loss_metric``. Returns nothing.
    """
    with tf.GradientTape() as tape:
        predictions = model(input_batch, training=True)
        batch_loss = loss(target_batch, predictions)

    weights = model.trainable_weights
    grads = tape.gradient(batch_loss, weights)
    opt.apply_gradients(zip(grads, weights))
    loss_metric(batch_loss)

In [None]:
# Number of full passes over the training dataset.
epochs = 1

In [None]:
for epoch in range(epochs):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)

    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    # Clear the running mean so each epoch reports its own average loss.
    # reset_state() is the current name of the deprecated reset_states().
    loss_metric.reset_state()

# Guardamos el modelo

In [None]:
# A subclassed Keras model cannot be written to a single HDF5 (.h5) file, so
# save in the TensorFlow SavedModel format (a directory). The path is the one
# the load_model() cell reads back.
model.save('./modelTensor/model_RNN')

In [None]:
# Reload the model from disk.
# NOTE(review): make sure this is exactly the path model.save() wrote to, and
# note that the generation loop further down calls `model`, not `model2`.
model2 = tf.keras.models.load_model('./modelTensor/model_RNN')

# Generación

In [None]:
states = None
start = 'tony stark'
context = tf.constant([start])
output = [start]

# Vectorize the seed once. The layer pads every example to a fixed length
# with id 0, so only the leading non-zero ids are fed: this primes the GRU
# with the whole seed instead of just its first token.
seed_ids = vectorize_layer(context)
seed_len = max(int(tf.math.count_nonzero(seed_ids[0])), 1)
step_input = seed_ids[:, :seed_len]

for i in range(50):
    pred_logits, states = model(step_input,
                                states=states, return_state=True)
    # Sample the next token id from the logits of the last time step.
    pred_index = tf.random.categorical(pred_logits[:, -1, :],
                                       num_samples=1)
    output.append(vocab[int(pred_index[0, 0])])
    # Feed the sampled id (shape (1, 1)) straight back in; converting it to
    # its word and re-vectorizing would yield the same id.
    step_input = pred_index

' '.join(output)