In [1]:
import tensorflow as tf
import numpy as np
import os

In [2]:
# Corpus file, resolved relative to the current working directory.
path_to_file = 'evgenyi_onegin.txt'
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the contents are in memory.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [3]:
# Every distinct character of the corpus, in sorted order, forms the vocabulary.
vocab = sorted(set(text))

# Lookup tables in both directions: character -> index and index -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)
# Encode the whole corpus as an array of vocabulary indices.
text_as_int = np.array([char2idx[symbol] for symbol in text])

# Number of input characters per training example.
seq_length = 100

# Stream the encoded corpus one index at a time, then group it into chunks of
# seq_length + 1 indices; a trailing chunk that is too short is dropped.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [4]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

In [5]:
# Split every chunk into its (input, target) pair.
dataset = sequences.map(map_func=split_input_target)

In [6]:
# Decode the first (input, target) pair back to text as a sanity check.
for input_example, target_example in dataset.take(1):
    input_chars = idx2char[input_example.numpy()]
    target_chars = idx2char[target_example.numpy()]
    print('Input data: ', repr(''.join(input_chars)))
    print('Target data:', repr(''.join(target_chars)))

Input data:  'Александр Сергеевич Пушкин\n\n                                Евгений Онегин\n                         '
Target data: 'лександр Сергеевич Пушкин\n\n                                Евгений Онегин\n                          '


In [7]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000
# Build the training pipeline from `sequences` instead of re-assigning
# `dataset` from itself, so this cell is idempotent: running it a second time
# no longer shuffles and batches an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [8]:
# Model hyperparameters.
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_dim = 256  # passed to build_model as the Embedding output size
rnn_units = 1024  # passed to build_model as the GRU unit count

In [9]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> GRU -> Dense character model.

    Args:
        vocab_size: Number of distinct characters; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch dimension of the input; the sequence length
            is left unspecified.

    Returns:
        An uncompiled `tf.keras.Sequential` model. The Dense layer has no
        activation, so the outputs are unnormalized scores per character.
    """
    layers = tf.keras.layers
    embedding = layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # stateful=True keeps the recurrent state between successive calls instead
    # of resetting it for every batch.
    recurrent = layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [10]:
# Training model: its batch dimension is fixed to the training batch size.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [11]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           33536     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 131)           134275    
                                                                 
Total params: 4,106,115
Trainable params: 4,106,115
Non-trainable params: 0
_________________________________________________________________


In [12]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between labels and raw model outputs.

    `from_logits=True` tells the loss that `logits` are unnormalized scores
    rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)

# Train with the Adam optimizer against the loss defined above.
model.compile(loss=loss, optimizer='adam')

In [13]:
# Checkpoint path template: {epoch:04d} is filled in with the epoch number, so
# each epoch's weights are written to their own file.
checkpoint_path = "model_checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
# Save only the weights (not the full model) and log every save (verbose=1).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True)

In [14]:
# Number of passes over the training dataset.
EPOCHS = 80
# Train the model; the checkpoint callback saves weights as training proceeds
# and the value returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/80
Epoch 1: saving model to model_checkpoints\cp-0001.ckpt
Epoch 2/80
Epoch 2: saving model to model_checkpoints\cp-0002.ckpt
Epoch 3/80
Epoch 3: saving model to model_checkpoints\cp-0003.ckpt
Epoch 4/80
Epoch 4: saving model to model_checkpoints\cp-0004.ckpt
Epoch 5/80
Epoch 5: saving model to model_checkpoints\cp-0005.ckpt
Epoch 6/80
Epoch 6: saving model to model_checkpoints\cp-0006.ckpt
Epoch 7/80
Epoch 7: saving model to model_checkpoints\cp-0007.ckpt
Epoch 8/80
Epoch 8: saving model to model_checkpoints\cp-0008.ckpt
Epoch 9/80
Epoch 9: saving model to model_checkpoints\cp-0009.ckpt
Epoch 10/80
Epoch 10: saving model to model_checkpoints\cp-0010.ckpt
Epoch 11/80
Epoch 11: saving model to model_checkpoints\cp-0011.ckpt
Epoch 12/80
Epoch 12: saving model to model_checkpoints\cp-0012.ckpt
Epoch 13/80
Epoch 13: saving model to model_checkpoints\cp-0013.ckpt
Epoch 14/80
Epoch 14: saving model to model_checkpoints\cp-0014.ckpt
Epoch 15/80
Epoch 15: saving model to model_checkpoi

In [15]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text by repeatedly sampling the next character from `model`.

    The seed string is fed through the model once; after that, each sampled
    character is fed back in as the only input for the next step, so the model
    is expected to carry context in its own state between calls.

    Args:
        model: Model that is called on a batch holding a single sequence of
            character indices and that exposes `reset_states()`.
        start_string: Non-empty seed text; every character must be a key of
            `char2idx`.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        `start_string` followed by the sampled characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not a
            positive number.
        KeyError: If `start_string` contains a character missing from
            `char2idx`.
    """
    # Validate up front: an empty seed gives the model nothing to predict
    # from, and a zero, negative or NaN temperature corrupts the logits.
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if not temperature > 0:
        raise ValueError(
            'temperature must be positive, got %r' % (temperature,))

    # Encode the seed and add a leading batch dimension of one.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    # Clear any state left over from a previous call.
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        # Sample from the logits and keep the draw for the last time step.
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character becomes the sole input of the next step.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [16]:
# Rebuild the same architecture with a batch size of one: the batch dimension
# is fixed when the model is built, and generation feeds a single sequence.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        'No checkpoint found in %r; run the training cell first.'
        % (checkpoint_dir,))
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [17]:
# Sample 1000 characters that continue the seed phrase, then print the result.
predicted_text = generate_text(
    model, start_string='И вот идет уже ', num_generate=1000, temperature=1.0)

print(predicted_text)

И вот идет уже друг;
                        Крястила при лане музу век
                        Знака зы не замечает,
                         Походкой твердой, тихо, ровно
                        Четыре перешли шага,
                               И об Глаза С мечты;
                                  S, взорем, я знаю, краса
                        Вражда, надежда и забир,
                        Что нас за столовые заставил,
                             Лета шалун и жизнь робает,
                         Была наука страсти нежной,
                       Кучты, милые друзья! {40}

                                     By, мой пред астой шаровил:
                        Она глядит ему в лицо.
                       ".
                Погрейся о  На шубах у свест гостей                  LII

                        Не так ли я? где же вхлу.
                        Не разойтиться ль полюбовно?..
                       Но, шумом бала утомленный
                        И утром должен быть у