# Proyecto final

In [19]:
import os
import numpy as np
from os import listdir
from os.path import isfile, join
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers
import re #regex
import pickle
import tqdm
from sklearn.model_selection import train_test_split #particiones

AUTOTUNE = tf.data.experimental.AUTOTUNE

In [45]:
def get_txt(path):
    """
    Regresa una lista con el contenido de todos los archivos de un directorio

    Args:
        path (str): ruta de la carpeta
    """
    text = []
    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
    
    for file in onlyfiles:
        with open(path+"/"+file, 'r') as f:
            text += f.readlines()
    return text

# Guardamos cada película en un diccionario
# cada entrada del diccionario es una lista con las peliculas leídas
corpus = []
corpus += get_txt("../corpus/Pride & Prejudice")
corpus += get_txt("../corpus/Marvel")
corpus += get_txt("../corpus/Christopher Nolan")
corpus

['        1\n',
 '\n',
 '      It is a truth universally acknowledged, that a single man in\n',
 '      possession of a good fortune, must be in want of a wife.\n',
 '\n',
 '      However little known the feelings or views of such a man may be\n',
 '      on his first entering a neighbourhood, this truth is so well\n',
 '      fixed in the minds of the surrounding families, that he is\n',
 '      considered the rightful property of some one or other of their\n',
 '      daughters.\n',
 '\n',
 '      "My dear Mr. Bennet," said his lady to him one day, "have you\n',
 '      heard that Netherfield Park is let at last?"\n',
 '\n',
 '      Mr. Bennet replied that he had not.\n',
 '\n',
 '      "But it is," returned she; "for Mrs. Long has just been here, and\n',
 '      she told me all about it."\n',
 '\n',
 '      Mr. Bennet made no answer.\n',
 '\n',
 '      "Do you not want to know who has taken it?" cried his wife\n',
 '      impatiently.\n',
 '\n',
 '      "_You_ want to tell me, and I

In [33]:
len(corpus)

59212

In [48]:
def get_clean(corpus):
    clean = []
    pattern = r'[^a-z0-9 .,!;:]'
    for w in corpus:
        #convierte a minúsculas
        w = w.lower()
        w = re.sub(pattern,'', w)
        if w != '' or w == '\n':
            clean.append(w.strip())
    return clean

clean = get_clean(corpus)
clean

['1',
 'it is a truth universally acknowledged, that a single man in',
 'possession of a good fortune, must be in want of a wife.',
 'however little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood, this truth is so well',
 'fixed in the minds of the surrounding families, that he is',
 'considered the rightful property of some one or other of their',
 'daughters.',
 'my dear mr. bennet, said his lady to him one day, have you',
 'heard that netherfield park is let at last',
 'mr. bennet replied that he had not.',
 'but it is, returned she; for mrs. long has just been here, and',
 'she told me all about it.',
 'mr. bennet made no answer.',
 'do you not want to know who has taken it cried his wife',
 'impatiently.',
 'you want to tell me, and i have no objection to hearing it.',
 'this was invitation enough.',
 'why, my dear, you must know, mrs. long says that netherfield is',
 'taken by a young man of large fortune from the north of england;',
 '

In [52]:
# Obtenemos el tamaño de la oracion mas grande

def get_longest_sentence(corpus):
    largest = []
    for sentence in corpus:
        if len(sentence) > len(largest):
            largest = sentence
    return largest

largest = get_longest_sentence(clean)
print(largest)
print(len(largest.split(' ')))

man 2: i dont care who describes it, there is no way for it to be exaggerated. it was that bad.
21


In [53]:
# Particiones 
train, test = train_test_split(clean, test_size=0.3)
print('Número de cadenas train:',len(train))
print('Número de cadenas test:',len(test))
pickle.dump(train, open('./pickles/datasets/train.pkl','wb'))
pickle.dump(test, open('./pickles/datasets/test.pkl','wb'))

Número de cadenas train: 39753
Número de cadenas test: 17037


In [54]:
train = pickle.load(open('./pickles/datasets/train.pkl','rb'))
test = pickle.load(open('./pickles/datasets/test.pkl','rb'))

print('Número de cadenas train:',len(train))
print('Número de cadenas test:',len(test))
print(train[:3])

Número de cadenas train: 39753
Número de cadenas test: 17037
['aint no time, im facin scams nah, nah', 'to the bahamas for a little getaway.', 'stark, were on your three, headed northeast.']


In [75]:
raw_train_ds = tf.data.Dataset.from_tensor_slices(np.array(train).flatten())
batch_size = 32
BUFFER_SIZE = len(raw_train_ds)

raw_train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))


In [76]:
for batch in raw_train_ds.take(1):
    print(batch)

tf.Tensor(
[b'that they had always distrusted the appearance of his goodness.'
 b'were about to die, and this is what were discussing'
 b'import all preferences from home interface.'
 b'that you dont have to do this, kat...'
 b'whatever i need to do to get a damn ride home.'
 b'now if youll excuse me, ive got tickets to watch our boys thrash rapid city.'
 b'each one you buy is a bullet in the barrel of your best guys gun.'
 b'it is a far, far better thing that i do...'
 b'i think this machine works.'
 b'the stones are in the past. we could go back, we could get them.'
 b'i couldnt let him die thinking hed won.' b'it did nothing.'
 b'my dagger, something.'
 b'so they can give you a lift past their fiveblock perimeter.'
 b'itd be nice though, right'
 b'do you remember anything  just fragments. images.'
 b'giving them a hint to be gone.' b'hes fine. not injured at all.'
 b'youre here, arent you' b'well, its only been two days, so...'
 b'back, i can call on lady lucas and mrs. long. kitty,

In [77]:
voc_size = 20406

vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=voc_size,
    output_mode='int',
    output_sequence_length=22,
)

vectorize_layer.adapt(raw_train_ds, 32)
vocab = vectorize_layer.get_vocabulary()
voc_size = len(vocab)
voc_size

20406

In [78]:
vectorize_layer(['Love you', '3 millions'])

<tf.Tensor: shape=(2, 22), dtype=int64, numpy=
array([[   1,    4,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   1, 2126,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])>

In [79]:
def get_input_target(text):
    tokenized_text = vectorize_layer(text)
    input_text = tokenized_text[:, :-1]
    target_text = tokenized_text[:, 1:]
    return input_text, target_text

In [80]:
train_ds = raw_train_ds.map(get_input_target)

In [81]:
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])

(32, 21) (32, 21)
tf.Tensor(
[  29 2541  740    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0], shape=(21,), dtype=int64) tf.Tensor(
[2541  740    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0], shape=(21,), dtype=int64)


Definir modelo

In [100]:
emb_dim = 256
model_dim = 1024

In [112]:
class RNN(tf.keras.Model):
    def __init__(self, voc_size, emb_dim, model_dim):
        super().__init__(self)
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 

rnn = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

Probamos el modelo

In [113]:
for input_batch, target_batch in train_ds.take(1):
    predictions = rnn(target_batch)
    print(predictions.shape, target_batch.shape)

(32, 21, 20406) (32, 21)


In [114]:
predictions[0].shape

TensorShape([21, 20406])

In [115]:
rnn.summary()

Model: "rnn_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    multiple                  5223936   
                                                                 
 gru_11 (GRU)                multiple                  3938304   
                                                                 
 dense_16 (Dense)            multiple                  20916150  
                                                                 
Total params: 30,078,390
Trainable params: 30,078,390
Non-trainable params: 0
_________________________________________________________________


In [116]:
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
pred_indices[:, 0]

<tf.Tensor: shape=(21,), dtype=int64, numpy=
array([ 2399, 10394,  9498,  9896, 10303,  7658,  1947,  3263, 14238,
       14839,  3249, 16005,  2308, 16238, 11766,  3114, 15281,   882,
        6707,  8107,   403])>

In [117]:
' '.join([vocab[_] for _ in input_batch[0]])

'rejected many as deficient in size and importance.             '

### Entrenamiento

In [118]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
loss_metric = tf.keras.metrics.Mean(name='loss')

In [120]:
@tf.function
def train_step(input_batch, target_batch):
    with tf.GradientTape() as tape:
        logits = rnn(input_batch, training=True)
        loss_value = loss(target_batch, logits)

    gradients = tape.gradient(loss_value, rnn.trainable_weights)
    opt.apply_gradients(zip(gradients, rnn.trainable_weights))
    loss_metric(loss_value)

In [None]:
epochs = 1

for epoch in tqdm(range(epochs)):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)
        
    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    loss_metric.reset_states()

# Guardamos el modelo

In [None]:
rnn.save('./modelTensor/model_RNN.h5')

In [125]:
model = tf.keras.models.load_model('./modelTensor/model_RNN')

EOFError: Ran out of input

# Generación

In [None]:
states = None
start = 'tony stark'
context = tf.constant([start])
output = [start]

for i in range(21):
    # Obtener solo el primer elemento que regresa vectorize_layer
    pred_logits, states = model(vectorize_layer(context)[:, :1], 
                                states=states, return_state=True)
    #print(pred_logits.shape)
    pred_index = tf.random.categorical(pred_logits[:, -1, :], 
                                       num_samples=1)

    #print(vocab[pred_index[0, 0]])
    context = tf.constant([vocab[pred_index[0, 0]]])
    output.append(vocab[pred_index[0, 0]])
    
' '.join(output)