<a href="https://colab.research.google.com/github/Khlebovich-Alexandra/horoscope_generator/blob/master/text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text generation with an RNN

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM 
from tensorflow.keras.utils import to_categorical

import numpy as np
import pandas as pd
import itertools
import os
import re
import time
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [0]:
posts = pd.read_csv('https://raw.githubusercontent.com/Khlebovich-Alexandra/horoscope_generator/master/Data/final_posts.csv', index_col=0)

In [0]:
posts

Unnamed: 0,text,length,date,type,index in posts,domain,index_before_concat
0,сегодня вы можете почувствовать незащищенность...,339,11 декабря,business,0,ribyhoroscop,0
1,вам звёзды рекомендуют сегодня больше работать...,303,11 декабря,love,0,ribyhoroscop,1
2,"если сегодня вы почувствуете робость, или, тог...",215,11 декабря,simple,0,ribyhoroscop,2
3,сегодня звезды рекомендуют вам заняться коррек...,278,10 декабря,business,1,ribyhoroscop,3
4,"сегодня ваш любимый человек признается в том, ...",273,10 декабря,love,1,ribyhoroscop,4
...,...,...,...,...,...,...,...
26957,"день преобразования космических энергий, получ...",544,2 марта,simple,2648,strelechoroscop,4575
26958,завтра - полон перемен для вас. поэтому строит...,256,1 марта,simple,2649,strelechoroscop,4576
26959,утром не мешкая приступайте к работе или завяз...,441,28 февраля,simple,2650,strelechoroscop,4577
26960,общительность и дружелюбие вас сегодня могут н...,453,27 февраля,simple,2651,strelechoroscop,4578


In [0]:
posts.drop(['date', 'type', 'index in posts', 'domain', 'index_before_concat'], axis=1, inplace=True)

In [0]:
posts.drop(np.argmax(posts.length), inplace=True)

In [0]:
def add_spaces(x):
    literals = ',.?!;:()'
    for literal in literals:
        if literal in x:
            x = x.replace(literal, ' ' + literal + ' ')
            
    pattern_bad_spaces_1 = re.compile(r'(^ +)|( +$)')
    pattern_bad_spaces_2 = re.compile(r'(  +)|(\t)')
    x = re.sub(pattern_bad_spaces_1, '', x)
    x = re.sub(pattern_bad_spaces_2, ' ', x)
    return x

In [0]:
def split(x):
    res = re.split(re.compile(r'[ ]+'), x.lower())
    while '' in res:
        res.remove('')
    return res

In [0]:
def create_vocab(texts):
    texts = pd.Series(texts)
    texts = texts.map(add_spaces)
    texts = texts.map(split)
    word_index = {}
    word_index["<Заполнитель>"] = 0
    word_index["<Начало последовательности>"] = 1
    word_index["<Не используется>"] = 2
    word_index["<Неизвестное слово>"] = 3
    ind = 4
    for text in texts:
        for word in text:
            if not (word in word_index):
                word_index[word] = ind
                ind += 1
    return word_index

In [0]:
def word_to_int(word):
    if word in word_index:
        return word_index[word]
    return 3

In [0]:
def get_word_by_index(index):
    for word, value in word_index.items():
        if index == value:
            return word
    return ''

In [0]:
def create_dataset(text):
    text = add_spaces(text)
    text = split(text)
    text = list(map(word_to_int, text))
    return text

In [0]:
word_index = create_vocab(posts.text)

In [0]:
word_index['.']

14

In [211]:
len(word_index)

47923

In [0]:
print(create_dataset(posts.text[0]))

[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 14, 28, 29, 30, 31, 32, 33, 19, 34, 35, 36, 37, 19, 38, 39, 40, 41, 42, 43, 44, 14, 45, 46, 47, 25, 48, 49, 14]


In [0]:
posts.text = posts.text.map(create_dataset)

In [0]:
posts.length = posts.text.map(len)

Save DataFrame and vocabluary

In [0]:
posts.to_csv('posts_word_to_int.csv')

In [0]:
import json

with open('vocab.json', 'w') as fp:
    json.dump(word_index, fp)

Padding seq

In [0]:
def get_list_of_ints(x):
    if isinstance(x, str):
        x = x.strip('][').split(', ')
    return list(map(int, x))

In [0]:
posts = pd.read_csv('posts_word_to_int.csv', index_col=0)
posts.text = posts.text.map(get_list_of_ints)
posts

Unnamed: 0,text,length
0,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...",53
1,"[50, 51, 52, 4, 53, 54, 14, 55, 56, 57, 58, 59...",49
2,"[89, 4, 5, 90, 91, 19, 56, 19, 92, 93, 94, 19,...",40
3,"[4, 109, 52, 50, 110, 111, 56, 112, 113, 114, ...",46
4,"[4, 136, 137, 138, 139, 11, 140, 19, 20, 141, ...",46
...,...,...
26957,"[199, 7429, 7906, 19954, 19, 8101, 20213, 8982...",81
26958,"[3016, 81, 2294, 3598, 47, 25, 14, 302, 6957, ...",40
26959,"[8803, 121, 47918, 13480, 71, 535, 56, 35595, ...",63
26960,"[8673, 9, 5898, 25, 4, 702, 23496, 87, 19815, ...",68


In [0]:
posts.describe()

Unnamed: 0,length
count,26961.0
mean,50.23764
std,15.919764
min,4.0
25%,40.0
50%,48.0
75%,58.0
max,129.0


In [0]:
maxlen = 50

In [0]:
texts = pad_sequences(posts.text, maxlen, padding='post')
texts = tf.data.Dataset.from_tensor_slices(texts)

In [0]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = texts.map(split_input_target)

In [278]:
dataset.take(1)

<TakeDataset shapes: ((49,), (49,)), types: (tf.int32, tf.int32)>

In [282]:
for i, (input_idx, target_idx) in enumerate(zip(dataset[:5], dataset[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, get_word_by_index(input_idx)))
    print("  expected output: {} ({:s})".format(target_idx, 
                                                get_word_by_index(target_idx)))

TypeError: ignored

In [280]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 49), (64, 49)), types: (tf.int32, tf.int32)>

In [283]:
# define model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[BATCH_SIZE, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])

print(model.summary())

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (64, None, 256)           12268288  
_________________________________________________________________
gru_6 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_12 (Dense)             (64, None, 47923)         49121075  
Total params: 65,327,667
Trainable params: 65,327,667
Non-trainable params: 0
_________________________________________________________________
None


In [284]:
 for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 49, 47923) # (batch_size, sequence_length, vocab_size)


In [0]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [286]:
sampled_indices

array([ 9981,  6889, 36817, 19912, 31095, 37149, 27442, 33349, 16949,
       13260, 17136, 19531, 22942, 11543, 46208, 21612, 30696, 24334,
       13966,  9663, 14451, 39965,  9993, 10153, 36351, 41017,  7143,
       37678, 21416, 29545, 11653, 10981,  6204, 11310, 17756, 33332,
       40570,  8428, 45829, 19308, 33244, 20903, 46799, 41686, 17884,
        4065, 33484, 14124, 19611])

Decode these to see the text predicted by this untrained model:

In [287]:
print("Input: \n", repr(" ".join([get_word_by_index(i) for i in input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr(" ".join([get_word_by_index(i) for i in sampled_indices])))

Input: 
 'это хорошее время для зарождения новых деловых проектов , переговоров и бесед , из которых можно почерпнуть много новых идей . для любви это волшебное , но несколько странное время . отношениям может не хватать тепла и нежности , но возможно быстрое сближение на уровне общих интересов и увлечений'

Next Char Predictions: 
 'волнуетесь случая приживется приукрашенном руководят мотивирует затруднительна срывов косвенное грандиозные десятка отклонять несбывшимися неотложных необычна безотказное уютнее необъятное моральным ревнивец спешка неуютного драгоценное средстве доверяют подхватывать банковский современная разойтись несправедливостью шоппинг встречался удовольствий утренние сосредоточенными «завернуть» пищевой беспрестанно рассеивайтесь кране знаеть недоумевающим обзаведитесь дешёвый взбодрит причиной требовательностью осмотрительней упростить'


## Train the model

The standard `tf.keras.losses.sparse_categorical_crossentropy` loss function works in this case because it is applied across the last dimension of the predictions.

Because our model returns logits, we need to set the `from_logits` flag.


In [288]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 49, 47923)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       10.777245


Configure the training procedure using the `tf.keras.Model.compile` method. We'll use `tf.keras.optimizers.Adam` with default arguments and the loss function.

In [0]:
model.compile(optimizer='adam', loss=loss)

### Configure checkpoints

Use a `tf.keras.callbacks.ModelCheckpoint` to ensure that checkpoints are saved during training:

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Execute the training

To keep training time reasonable, use 10 epochs to train the model. In Colab, set the runtime to GPU for faster training.

In [0]:
EPOCHS=1

In [292]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Train for 421 steps


## Generate text

### Restore the latest checkpoint

To keep this prediction step simple, use a batch size of 1.

Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built.

To run the model with a different `batch_size`, we need to rebuild the model and restore the weights from the checkpoint.


In [293]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_1'

In [304]:
tf.train.load_checkpoint('./training_checkpoints/ckpt_40')

<tensorflow.python._pywrap_checkpoint_reader.CheckpointReader at 0x7faa700d8570>

In [329]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[1, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])

model.load_weights('./training_checkpoints/ckpt_40')

model.build(tf.TensorShape([1, None]))



In [330]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (1, None, 256)            12268288  
_________________________________________________________________
gru_10 (GRU)                 (1, None, 1024)           3938304   
_________________________________________________________________
dense_16 (Dense)             (1, None, 47923)          49121075  
Total params: 65,327,667
Trainable params: 65,327,667
Non-trainable params: 0
_________________________________________________________________


### The prediction loop

The following code block generates the text:

* It Starts by choosing a start string, initializing the RNN state and setting the number of characters to generate.

* Get the prediction distribution of the next character using the start string and the RNN state.

* Then, use a categorical distribution to calculate the index of the predicted character. Use this predicted character as our next input to the model.

* The RNN state returned by the model is fed back into the model so that it now has more context, instead than only one word. After predicting the next word, the modified RNN states are again fed back into the model, which is how it learns as it gets more context from the previously predicted words.


![To generate text the model's output is fed back to the input](https://github.com/Khlebovich-Alexandra/horoscope_generator/blob/master/images/text_generation_sampling.png?raw=1)

Looking at the generated text, you'll see the model knows when to capitalize, make paragraphs and imitates a Shakespeare-like writing vocabulary. With the small number of training epochs, it has not yet learned to form coherent sentences.

In [0]:
def generate_text(model, start_string, num_of_words, temp):
  # Evaluation step (generating text using the learned model)

  # Converting our start string to numbers (vectorizing)
  text = start_string

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = temp

  # Here batch size == 1
  model.reset_states()
  for i in range(num_of_words):
      input_eval = create_dataset(start_string)
      input_eval = pad_sequences([input_eval], maxlen - 1, padding='post')

      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the word returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted word as the next input to the model
      # along with the previous hidden state
      new_word = get_word_by_index(predicted_id)
      text = text + ' ' + new_word

  return text

In [338]:
generate_text(model, 'сегодня вам ', 10, 4.5)

'сегодня вам  стартовые энергетической более <Заполнитель> жребий обстоятельству следствие роднёй поворотами крайностей'

In [332]:
generate_text(model, 'сегодня вам ', 10, 55)

'сегодня вам  фотоаппарат новому запускайте поднятие утерянную содействовать заслуживает прослужат сходив зачем-то'

In [333]:
generate_text(model, 'сегодня вам ', 10, 28)

'сегодня вам  лестнице изрядной препятствий запоминающиеся пожелание моут участвующим основанной спонсорство раскритиковать'

The easiest thing you can do to improve the results it to train it for longer (try `EPOCHS=30`).

You can also experiment with a different start string, or try adding another RNN layer to improve the model's accuracy, or adjusting the temperature parameter to generate more or less random predictions.

## Advanced: Customized Training

The above training procedure is simple, but does not give you much control.

So now that you've seen how to run the model manually let's unpack the training loop, and implement it ourselves. This gives a starting point if, for example, to implement _curriculum learning_ to help stabilize the model's open-loop output.

We will use `tf.GradientTape` to track the gradients. You can learn more about this approach by reading the [eager execution guide](https://www.tensorflow.org/guide/eager).

The procedure works as follows:

* First, initialize the RNN state. We do this by calling the `tf.keras.Model.reset_states` method.

* Next, iterate over the dataset (batch by batch) and calculate the *predictions* associated with each.

* Open a `tf.GradientTape`, and calculate the predictions and loss in that context.

* Calculate the gradients of the loss with respect to the model variables using the `tf.GradientTape.grads` method.

* Finally, take a step downwards by using the optimizer's `tf.train.Optimizer.apply_gradients` method.



In [0]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function
def train_step(inp, target):
  with tf.GradientTape() as tape:
    predictions = model(inp)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            target, predictions, from_logits=True))
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  return loss

In [0]:
# Training step
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # initializing the hidden state at the start of every epoch
  # initally hidden is None
  hidden = model.reset_states()

  for (batch_n, (inp, target)) in enumerate(dataset):
    loss = train_step(inp, target)

    if batch_n % 100 == 0:
      template = 'Epoch {} Batch {} Loss {}'
      print(template.format(epoch+1, batch_n, loss))

  # saving (checkpoint) the model every 5 epochs
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch))

  print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

model.save_weights(checkpoint_prefix.format(epoch=epoch))

In [0]:
!ls

sample_data  training_checkpoints


In [0]:
model_json = model.to_json()
json_file = open('posts_model.json', 'w')
json_file.write(model_json)
json_file.close()
model.save_weights('posts_model.h5')

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!ls

posts_word_to_int.csv  sample_data  vocab.json


In [0]:
from google.colab import files

In [0]:
files.download('posts_word_to_int.csv')

In [0]:
files.download('vocab.json')