# Proyecto final

In [31]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers
import pickle

# Shared tf.data tuning constant for the input pipeline cells below.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [4]:
# NOTE(security): pickle can execute arbitrary code while loading; only open
# corpus files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('./corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
# Flatten the nested sequences of words into one flat list ...
corpus = [x for s in corpus for x in s]
# ... and join everything into a single space-separated string.
corpus = ' '.join(corpus)
# Preview the first 100 characters.
corpus[:100]

'1 truth universally acknowledged single man possession good fortune must want wife however little kn'

In [5]:
# Split the corpus string into word tokens.
tokenizer = tf_text.UnicodeScriptTokenizer()
# A single string is passed in, so element [0] of the result holds all of its
# tokens (byte strings, as the preview below shows).
movies_tokens =  tokenizer.tokenize([corpus]).to_list()[0]
# Preview the first ten tokens.
movies_tokens[:10]

[b'1',
 b'truth',
 b'universally',
 b'acknowledged',
 b'single',
 b'man',
 b'possession',
 b'good',
 b'fortune',
 b'must']

In [6]:
# Dataset that yields one token per element, in corpus order.
words_ds = tf.data.Dataset.from_tensor_slices(movies_tokens)

In [7]:
# Peek at the first 20 tokens of the dataset as raw byte strings.
for words in words_ds.take(20).as_numpy_iterator():
    print(words)

b'1'
b'truth'
b'universally'
b'acknowledged'
b'single'
b'man'
b'possession'
b'good'
b'fortune'
b'must'
b'want'
b'wife'
b'however'
b'little'
b'known'
b'feelings'
b'views'
b'man'
b'may'
b'first'


In [8]:
seq_length = 50
# Group tokens into windows of seq_length + 1 so that each window can later be
# split into an input and a target that are shifted by one position.
# Incomplete trailing windows are dropped.
words_batches = words_ds.batch(seq_length + 1, drop_remainder=True)

# Show the first window.
for words in words_batches.take(1).as_numpy_iterator():
    print(words)

[b'1' b'truth' b'universally' b'acknowledged' b'single' b'man'
 b'possession' b'good' b'fortune' b'must' b'want' b'wife' b'however'
 b'little' b'known' b'feelings' b'views' b'man' b'may' b'first'
 b'entering' b'neighbourhood' b'truth' b'well' b'fixed' b'minds'
 b'surrounding' b'families' b'considered' b'rightful' b'property' b'one'
 b'daughters' b'dear' b'mr' b'bennet' b'said' b'lady' b'one' b'day'
 b'heard' b'netherfield' b'park' b'let' b'last' b'mr' b'bennet' b'replied'
 b'returned' b'mrs' b'long']


In [9]:
def join_strings(tokens):
    """Join a 1-D tensor of string tokens into one space-separated string tensor.

    Args:
        tokens: string tensor whose first axis enumerates the tokens.

    Returns:
        The tokens concatenated along axis 0, separated by single spaces.
    """
    joined = tf.strings.reduce_join(tokens, separator=' ', axis=0)
    return joined

In [10]:
# Turn every window of tokens back into one space-separated string.
raw_train_ds = words_batches.map(join_strings)
batch_size = 32
# Use a shuffle buffer as large as the dataset so the shuffle covers all of it.
BUFFER_SIZE = len(raw_train_ds)

# Shuffle, group into fixed-size batches (dropping the incomplete last batch)
# and prefetch. Uses the notebook-wide AUTOTUNE constant instead of spelling
# out tf.data.experimental.AUTOTUNE a second time.
raw_train_ds = (
    raw_train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(AUTOTUNE))

In [11]:
# Inspect one batch of joined text sequences.
for batch in raw_train_ds.take(1):
    print(batch)

tf.Tensor(
[b'safe difficult times us must stand together good asgard yes course good wait word may beg indulgence majesty perhaps reconsider done scoffs need horse nt horses dogs cats birds give one large enough ride honking still need lift never done anything like ever done anything like many times brave well stole'
 b'got got first like said ca nt stay groans move right behind pepper get gonna find way around stop stopping get get outside go pepper straining repulsor fires scream groaning grunting ch god tony groaning straining jarvis sir ms potts clear structure straining machine gun firing grunting groans mechanical whirring gasps'
 b'could shut ned hi captain america whether classroom battlefield know met stole shield today good friend gym teacher conducting captain america fitness challenge thank captain pretty sure war criminal show videos required state let avengers pay taxes hulk smell like bet smells nice shut captain america cool like mean old grandpa'
 b'nt childishly cold

In [12]:
# Upper bound used to cap the vocabulary; replaced below by the actual size.
voc_size = 16380

# Map each text sequence to a fixed-length vector of integer word ids.
# standardize=None: the text is used as-is, with no standardization step.
vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=voc_size - 1,
    output_mode='int',
    output_sequence_length=seq_length + 1,
    #split='character'
)

# Build the vocabulary from the training text.
vectorize_layer.adapt(raw_train_ds)
vocab = vectorize_layer.get_vocabulary()
# From here on voc_size is the real vocabulary size; it sizes the model's
# embedding and output layers.
voc_size = len(vocab)
voc_size

16354

In [13]:
# Sanity check: vectorize two short strings; the output is padded with zeros up
# to seq_length + 1 ids.
vectorize_layer(['Love you', '3 millions'])

<tf.Tensor: shape=(2, 51), dtype=int64, numpy=
array([[   1,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0],
       [2728, 1852,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

In [14]:
def get_input_target(text):
    """Vectorize a batch of text and split it into (input, target) id sequences.

    The target is the input shifted one position to the left, so at every time
    step the model is trained to predict the next word id.

    Args:
        text: batch of strings accepted by `vectorize_layer`.

    Returns:
        Tuple `(input_ids, target_ids)`: all ids but the last, and all ids but
        the first, along axis 1.
    """
    token_ids = vectorize_layer(text)
    return token_ids[:, :-1], token_ids[:, 1:]

In [15]:
# Convert every text batch into (input ids, target ids) pairs.
train_ds = raw_train_ds.map(get_input_target)

In [16]:
# Check shapes and show the first example: the target is the input shifted by
# one position.
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])

(32, 50) (32, 50)
tf.Tensor(
[  216    43    37     2   117   333  1145 13540  2540   767  1630    28
    35   564   235   589   411   580    13  4528  1628   227   296  6251
  4900  1392   361  4738  2316   503  4168   244 10038  1790   805   230
  4964   391     9   140   875   349 14176   233  5573  2864   833     2
  1306   281], shape=(50,), dtype=int64) tf.Tensor(
[   43    37     2   117   333  1145 13540  2540   767  1630    28    35
   564   235   589   411   580    13  4528  1628   227   296  6251  4900
  1392   361  4738  2316   503  4168   244 10038  1790   805   230  4964
   391     9   140   875   349 14176   233  5573  2864   833     2  1306
   281  1650], shape=(50,), dtype=int64)


Definir modelo

In [17]:
# Size of the word embedding vectors.
emb_dim = 256
# Number of units in the GRU layer.
model_dim = 1024

In [18]:
class RNN(tf.keras.Model):
    """Word-level language model: embedding -> GRU -> dense logits over the vocabulary."""

    def __init__(self, voc_size, emb_dim, model_dim):
        """Create the layers.

        Args:
            voc_size: vocabulary size (embedding input dim and number of logits).
            emb_dim: dimension of the word embeddings.
            model_dim: number of GRU units.
        """
        # `super().__init__()` must not receive `self` again: the original
        # `super().__init__(self)` handed the instance to the base class as an
        # extra positional argument.
        super().__init__()
        self.embedding = layers.Embedding(voc_size, emb_dim)
        # return_sequences: emit an output at every time step.
        # return_state: also return the final state so generation can carry it
        # over between calls.
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        # No activation: the layer emits raw logits.
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of word-id sequences.

        Args:
            inputs: integer tensor of word ids.
            states: optional initial GRU state; when None the GRU's default
                initial state is used.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The logits, or `(logits, states)` when `return_state` is True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        return x

# Instantiate the language model with the vocabulary size measured by the
# vectorizer and the hyperparameters defined above.
model = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

In [19]:
# Run one batch through the untrained model to check output shapes.
# The model must be fed the *input* ids; the original passed `target_batch`,
# i.e. the sequence the model is supposed to predict.
for input_batch, target_batch in train_ds.take(1):
    predictions = model(input_batch)
    print(predictions.shape, target_batch.shape)

(32, 50, 16354) (32, 50)


In [20]:
# Print the per-layer parameter counts.
model.summary()

Model: "rnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  4186624   
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  16762850  
                                                                 
Total params: 24,887,778
Trainable params: 24,887,778
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Logits for the first sequence of the batch: one row per time step, one
# column per vocabulary entry.
predictions[0].shape

TensorShape([50, 16354])

In [22]:
# Sample one word id per time step from the (still untrained) model's logits.
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
# Drop the num_samples axis to get a flat vector of ids.
pred_indices[:, 0]

<tf.Tensor: shape=(50,), dtype=int64, numpy=
array([ 9440, 12747,  7556, 11468, 14357, 15374,  9365,  4780,  1292,
       11419, 14064,  8400,   353,  4579, 14653, 12278, 14563,  5589,
         865,  5567, 16116,  7819,  6907,   460,  3867, 14386,  1099,
        1451, 12952, 14009,  6716,  8379, 15562, 14306,  8918, 14580,
       11460, 14858,  3575,  3011, 13421,  1332, 13786, 12325,  5628,
       14996, 10755, 14855,  4454,  2481])>

Obtener palabras a través de índices con vocab

In [23]:
# Decode the first input sequence back into words via the vocabulary list.
' '.join(vocab[word_id] for word_id in input_batch[0])

'ten years ago lazarus missions prof brand twelve possible worlds twelve ranger launches carrying bravest humans ever live led remarkable dr mann doyle person landing pod enough life support two years could use hibernation stretch making observations organics decade mission assess world showed potential could send signal bed long nap'

In [24]:
# Decode the ids sampled from the untrained model back into words.
' '.join(vocab[word_id] for word_id in pred_indices[:, 0])

'willfully libertywhich roads puff duplicitous carpenter yahtzee western seat quim faithfully freddy knows disposal designto nada disappears analysis soul atom airduct pays anna full mortification drooping shouting anxiety jilt fibrillation contrived gaieties bosoms elbow clearer diminishes punched crib display stammering hes bitch fusing mountaintops 35 consumes shrouded crimefighting inconvenient central'

# Entrenamiento

In [25]:
# from_logits=True because the model's final Dense layer has no activation.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
# Running mean of the batch losses, reported once per epoch.
loss_metric = tf.keras.metrics.Mean(name='loss')

In [26]:
@tf.function
def train_step(input_batch, target_batch):
    """Run one optimization step on a single batch.

    Computes the loss of the model's logits against `target_batch`, applies the
    gradients with `opt`, and records the batch loss in `loss_metric`.

    Args:
        input_batch: word ids fed to the model.
        target_batch: word ids the model should predict.
    """
    weights = model.trainable_weights
    with tf.GradientTape() as tape:
        batch_loss = loss(target_batch, model(input_batch, training=True))

    grads = tape.gradient(batch_loss, weights)
    opt.apply_gradients(zip(grads, weights))
    loss_metric(batch_loss)

In [27]:
# Number of full passes over the training dataset.
epochs = 100

In [None]:
# Train for `epochs` passes over the dataset, reporting the mean batch loss of
# each epoch.
for epoch in range(epochs):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)

    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    # Start the next epoch's average from scratch. `reset_state()` replaces the
    # deprecated `reset_states()` spelling.
    loss_metric.reset_state()

# Guardamos el modelo

In [None]:
# Subclassed Keras models cannot be written to HDF5 ('.h5'); save in the
# SavedModel format instead, to the same directory the next cell loads from.
model.save('./modelTensor/model_RNN', save_format='tf')

In [None]:
# Reload the saved model from disk into a separate variable.
model2 = tf.keras.models.load_model('./modelTensor/model_RNN')

# Generación

In [None]:
states = None
start = 'tony stark'
output = [start]

# Prime the GRU with every word of the prompt. The original sliced the
# vectorized prompt with [:, :1], so only the first word ('tony') ever reached
# the model. The vectorizer pads to a fixed length, so keep exactly as many ids
# as the prompt has whitespace-separated words (at least one).
prompt_len = max(1, len(start.split()))
context = vectorize_layer(tf.constant([start]))[:, :prompt_len]

for i in range(50):
    pred_logits, states = model(context, states=states, return_state=True)
    # Sample the next word id from the logits of the last time step.
    pred_index = tf.random.categorical(pred_logits[:, -1, :],
                                       num_samples=1)

    output.append(vocab[int(pred_index[0, 0])])
    # Feed the sampled id (shape (1, 1)) straight back in as the next input;
    # the carried-over GRU state holds the history, and no round trip through
    # the vocabulary string and the vectorizer is needed.
    context = pred_index

' '.join(output)