# ETL

In [1]:
import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as et 

In [2]:
xtree = et.parse("archivos/alvaro_uribe_speeches_2007_2010.xml")
xroot = xtree.getroot()

In [3]:
df_cols = ["url", "titulo", "discurso"]
rows = []

for node in xroot: 
    row = []
    
    url = node.find("url").text
    titulo = node.find("fecha").text
    discurso = node.find("discurso").text

    row.extend([url,titulo,discurso])
    rows.append(row)
    
df = pd.DataFrame(rows, columns = df_cols)

In [4]:
def fix_discurso(row):
    discurso = " ".join(row['discurso'].split('\n')[2:])
    fecha_lugar = row['discurso'].split('\n')[1]
    fecha = fecha_lugar[fecha_lugar.index('('):]
    lugar = fecha_lugar[:fecha_lugar.index('(')].strip()
    return fecha, lugar, discurso

In [5]:
df[['fecha', 'lugar', 'discurso']] = df.apply(fix_discurso, axis=1, result_type="expand")

In [6]:
df

Unnamed: 0,url,titulo,discurso,fecha,lugar
0,http://web.presidencia.gov.co/discursos/discur...,turistica_24022010,“Yo quiero felicitarlos d...,(Bogotá),Febrero 24 de 2010
1,http://web.presidencia.gov.co/discursos/discur...,lloreda_20012010,“Habríamos querido tener ...,(Bogotá),Enero 20 de 2009
2,http://web.presidencia.gov.co/discursos/discur...,bananeros_29042010,"Medellín, 29 abr (SP). “N...",(Medellín),29 de abril de 2010
3,http://web.presidencia.gov.co/discursos/discur...,ccg264_31012010,“Muy apreciados compatrio...,(Bucaramanga),Enero 31 de 2010
4,http://web.presidencia.gov.co/discursos/discur...,audiovisual_21012010,“Una ...,(Bogotá),Enero 21 de 2009
...,...,...,...,...,...
245,http://web.presidencia.gov.co/discursos/discur...,ccg291_29052010,"“Apreciados compatriotas,...",(Bogotá),29 de mayo de 2010
246,http://web.presidencia.gov.co/discursos/discur...,ccges_20022010,"“A esta hora, unos compañ...",(Bogotá),Febrero 20 de 2010
247,http://web.presidencia.gov.co/discursos/discur...,circulo_161207,“Mil ...,(Medellín - Antioquia),Diciembre 16 de 2007
248,http://web.presidencia.gov.co/discursos/discur...,ccg282_18042010,“Es muy grato estar nueva...,"(Cajica, Cundinamarca)",18 de abril de 2010


In [68]:
df.to_csv("archivos/uribe.csv", index=False)

In [7]:
text = df.discurso.str.cat(sep=" ").strip()

In [8]:
print(f"{len(text):,.0f} caracteres y {len(text.split(' ')):,.0f} palabras")

5,409,192 caracteres y 1,376,723 palabras


# RNN

In [9]:
import tensorflow as tf

#### Vectorize the text
Before training, you need to map strings to a numerical representation. Create two lookup tables: one mapping characters to numbers, and another for numbers to characters.

In [10]:
vocab = sorted(set(text))
print(f'{len(vocab):,.0f} caracteres únicos')

104 caracteres únicos


In [11]:
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

In [12]:
# Traducir a números
text_as_int = np.array([char2idx[c] for c in text])

In [13]:
np.array([char2idx[c] for c in "Hola, ¿cómo estás?"])

array([29, 62, 59, 48,  5,  0, 81, 50, 93, 60, 62,  0, 52, 66, 67, 87, 66,
       21])

#### The prediction task
Given a character, or a sequence of characters, what is the most probable next character? This is the task you're training the model to perform. The input to the model will be a sequence of characters, and you train the model to predict the output—the following character at each time step.


Since RNNs maintain an internal state that depends on the previously seen elements, given all the characters computed until this moment, what is the next character?

#### Create training examples and targets
Next divide the text into example sequences. Each input sequence will contain seq_length characters from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.

So break the text into chunks of seq_length+1. For example, say seq_length is 4 and our text is "Hello". The input sequence would be "Hell", and the target sequence "ello".



#### Example

```python
seq_length = 4

Text = "Hello"

Input = "Hell"
Output = "ello"
```

In [14]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1) # División entera

In [15]:
# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [16]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

In [17]:
split_input_target("Hola, ¿cómo estás?")

('Hola, ¿cómo estás', 'ola, ¿cómo estás?')

In [18]:
dataset = sequences.map(split_input_target)

In [19]:
for i,(input_example, target_example) in  enumerate(dataset.take(1)):
    print('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  '“Yo quiero felicitarlos de todo corazón por esta nueva  Vitrina Turística que tanto ayuda a Colombia'
Target data: 'Yo quiero felicitarlos de todo corazón por esta nueva  Vitrina Turística que tanto ayuda a Colombia.'


In [20]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print(f"Paso {i}")
    print(f"  input: {input_idx} ({repr(idx2char[input_idx])})")
    print(f"  expected output: {target_idx} ({repr(idx2char[target_idx])})")
    print()

Paso 0
  input: 100 ('“')
  expected output: 46 ('Y')

Paso 1
  input: 46 ('Y')
  expected output: 62 ('o')

Paso 2
  input: 62 ('o')
  expected output: 0 (' ')

Paso 3
  input: 0 (' ')
  expected output: 64 ('q')

Paso 4
  input: 64 ('q')
  expected output: 68 ('u')



#### Create training batches
You used tf.data to split the text into manageable sequences. But before feeding this data into the model, you need to shuffle the data and pack it into batches.

In [21]:
# Batch size (number of samples before a model is updated)
BATCH_SIZE = 64

# Avoid shuffling the entire dataset
BUFFER_SIZE = 10000

In [22]:
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [23]:
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

#### Build The Model
Use tf.keras.Sequential to define the model. For this simple example three layers are used to define our model:

- tf.keras.layers.Embedding: The input layer. A trainable lookup table that will map the numbers of each character to a vector with embedding_dim dimensions;
- tf.keras.layers.GRU: A type of RNN with size units=rnn_units (You can also use an LSTM layer here.)
- tf.keras.layers.Dense: The output layer, with vocab_size outputs.

In [24]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 64

# Number of RNN units
rnn_units = 512

In [25]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [26]:
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

#### Try the model
Now run the model to see that it behaves as expected.

In [27]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 104) # (batch_size, sequence_length, vocab_size)


In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 64)            6656      
_________________________________________________________________
gru (GRU)                    (64, None, 512)           887808    
_________________________________________________________________
dense (Dense)                (64, None, 104)           53352     
Total params: 947,816
Trainable params: 947,816
Non-trainable params: 0
_________________________________________________________________


#### Sample

In [29]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [30]:
sampled_indices

array([ 84,  14,  35,  47,  10,  50,  38,  76,  74,  96,  43,  72,  46,
        13,  49,  83,   5,  14,  62,  43,   5,  27,  35,  66,   7,  88,
        68,  58,  11,  10,  69,  67, 100,  74,  35,  63,  83,  21,  10,
        94,  68,  25,  24,  49,   2,  12,  52,  28,  86,  19,  86,  83,
        61,  93,  86,  20,  82,  33,  61,  55,  30,   3,  53,  75,  98,
        31,  74,  20, 102,  59,  69,  41,  23,  51,  88,  23,  37,  67,
        32,  56,  68,  98,  16,  37,  41,  96,  42,  70, 103,  78,  43,
       103,  16,  37,  40,  15,  15,   9,  76,  40])

In [31]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'rcito por orden del Presidente, para frustrar el acuerdo   humanitario.                        Las  '

Next Char Predictions: 
 "Í5NZ1cQ¡|–VyY4bÉ,5oV,FNs.äuk21vt“|NpÉ?1úuDCb'3eGÚ:ÚÉnóÚ;ÁLnhI(f\xa0‘J|;•lvTBdäBPtKiu‘7PT–Uw…°V…7PS660¡S"


#### Train the model 

At this point the problem can be treated as a standard classification problem. Given the previous RNN state, and the input this time step, predict the class of the next character.

#### Attach an optimizer, and a loss function
The standard tf.keras.losses.sparse_categorical_crossentropy loss function works in this case because it is applied across the last dimension of the predictions.

Because your model returns logits, you need to set the from_logits flag.



In [32]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 104)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.6457105


In [50]:
model.compile(optimizer='adam', loss=loss)

In [51]:
checkpoint_dir = 'archivos/checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

In [52]:
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [60]:
# number of complete passes through the training dataset
EPOCHS = 100

In [61]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [55]:
tf.train.latest_checkpoint(checkpoint_dir)

'archivos/checkpoints/ckpt_1'

In [56]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 64)             6656      
_________________________________________________________________
gru_2 (GRU)                  (1, None, 512)            887808    
_________________________________________________________________
dense_2 (Dense)              (1, None, 104)            53352     
Total params: 947,816
Trainable params: 947,816
Non-trainable params: 0
_________________________________________________________________


In [57]:
def generate_text(model, start_string, num_generate = 300):
    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperature results in more predictable text.
    # Higher temperature results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [59]:
print(generate_text(model, start_string=u"Compatriotas "))

Compatriotas fár aluctores, muy impertarios. Nosotros gra dos navios. Bon no hoy avanzado ha sido   toda, las hemasicionas hacenes hoy que targunta el año  estápamiento de Agriendo, que apciente no   hambiénes  cama un canrestros como inversiones, es un   Val nañoles. Andidos más huba resordafen de osclimos has 


In [67]:
EPOCHS = 100
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss)
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/100


ValueError: in user code:

    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/sequential.py:372 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:386 call
        inputs, training=training, mask=mask)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/layers/recurrent.py:659 __call__
        return super(RNN, self).__call__(inputs, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:976 __call__
        self.name)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:227 assert_input_compatibility
        ', found shape=' + str(shape))

    ValueError: Input 0 is incompatible with layer gru_2: expected shape=(1, None, 64), found shape=[64, 100, 64]
