In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.6.4
sys.version_info(major=3, minor=7, micro=12, releaselevel='final', serial=0)
matplotlib 3.5.2
numpy 1.21.6
pandas 1.3.5
sklearn 1.0.2
tensorflow 2.6.4
keras.api._v2.keras 2.6.0


In [3]:
input_filepath = '../input/shakespeare/shakespeare.txt'
text = open(input_filepath, 'r').read()
print(len(text))
print(text[:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
vocab = sorted(set(text))
print(len(vocab))
print(vocab)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
char2index = {char: index for index, char in enumerate(vocab)}
print(char2index)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [6]:
index2char = np.array(vocab)
print(index2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [7]:
text_as_int = np.array([char2index[c] for c in text])
print(text_as_int.shape)
print(len(text_as_int))
print(text_as_int[:10])
print(text[:10])

(1115394,)
1115394
[18 47 56 57 58  1 15 47 58 47]
First Citi


In [8]:
def split_input_target(id_text):
    """abcde -> abcd,bcde  输入是abcd，输出是bcde"""
    return id_text[0:-1], id_text[1:]

In [9]:
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequence_length = 100
# 输入sequence_length+1各字符 返回sequence_length个字符
# batch将字符转换为句子序列  drop_remainder丢掉最后不足一批的余数
sequence_dataset = char_dataset.batch(sequence_length + 1, drop_remainder=True)

for char_id in char_dataset.take(2):
    print(char_id, index2char[char_id.numpy()])
for sequence_id in sequence_dataset.take(2):
    print(sequence_id)
    print(repr(''.join(index2char[sequence_id.numpy()])))

2022-08-03 14:27:42.159834: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:27:42.168578: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:27:42.169317: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:27:42.170895: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

tf.Tensor(18, shape=(), dtype=int64) F
tf.Tensor(47, shape=(), dtype=int64) i
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int64)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int64)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


2022-08-03 14:27:43.192977: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2022-08-03 14:27:43.199770: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


In [10]:
# 做映射得到输入和输出
sequence_dataset = sequence_dataset.map(split_input_target)
for item_input, item_output in sequence_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())
print(sequence_dataset)

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

2022-08-03 14:27:43.267427: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [11]:
batch_size = 64
buffer_size = 10000
sequence_dataset = sequence_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
print(sequence_dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>


In [12]:
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        keras.layers.LSTM(units=rnn_units,
                          stateful=True,  # 是否将这一批次中索引i的每个样本的最后状态用作下一个批次中索引i的样本的初始状态
                          recurrent_initializer='glorot_uniform',  # 初始化权重矩阵，用于循环状态的线性转换
                          return_sequences=True),  # 返回所有输出
        keras.layers.Dense(vocab_size),
    ])
    return model


model = build_model(vocab_size=vocab_size, embedding_dim=embedding_dim,
                    rnn_units=rnn_units, batch_size=batch_size)
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
print(model.variables)

[<tf.Variable 'embedding/embeddings:0' shape=(65, 256) dtype=float32, numpy=
array([[-0.04525001,  0.02787185, -0.00784376, ...,  0.00832462,
         0.03347795,  0.01996693],
       [-0.01920562, -0.02134976,  0.03557615, ...,  0.00579132,
         0.00097162,  0.04300029],
       [ 0.04539466, -0.02200407, -0.03014871, ..., -0.01022156,
        -0.00414542,  0.04848817],
       ...,
       [ 0.02276044,  0.01871764,  0.00567186, ...,  0.02976039,
         0.00676261,  0.04662025],
       [-0.02065388, -0.03882288, -0.04119078, ..., -0.03003141,
         0.0051864 ,  0.03145988],
       [-0.03321948, -0.02551397,  0.04652753, ..., -0.00721079,
        -0.02157035, -0.02054813]], dtype=float32)>, <tf.Variable 'lstm/lstm_cell/kernel:0' shape=(256, 4096) dtype=float32, numpy=
array([[ 0.0209539 ,  0.00756338, -0.01243896, ...,  0.03083608,
         0.00919849, -0.00302912],
       [ 0.03646383,  0.01684374, -0.02385048, ..., -0.02545188,
         0.00052806,  0.03067524],
       [-0.002

In [14]:
for input_example_batch, target_example_batch in sequence_dataset.take(1):
    # 把model当函数来用，实际是调用类的call方法
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

2022-08-03 14:27:45.826129: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


(64, 100, 65)


In [15]:
print(example_batch_predictions)

tf.Tensor(
[[[-2.67272093e-03  1.63642666e-03 -6.87235675e-04 ... -1.64137292e-03
    1.56671787e-03 -4.15825576e-04]
  [-7.07834261e-04  9.96763818e-03 -1.59643730e-03 ... -7.33956462e-04
    2.03338522e-03  1.11386541e-03]
  [-2.00913707e-03  7.06743961e-03  5.34977438e-03 ... -2.78298790e-03
    5.05400263e-03  2.91223056e-03]
  ...
  [ 2.47331802e-04  1.14464862e-02 -1.03832269e-03 ... -2.67823134e-03
   -9.56431776e-03  2.31411238e-03]
  [-4.70491592e-04  1.59614123e-02 -1.77025143e-03 ... -1.46620208e-03
   -7.91234430e-03  2.89739715e-03]
  [-2.33091135e-03  1.82699747e-02 -6.04616944e-04 ... -4.46212944e-03
   -1.57148056e-02  1.55771314e-03]]

 [[ 3.89983086e-03  6.08957489e-04  6.29221089e-04 ...  2.52489024e-03
    4.47379984e-03 -4.25819121e-03]
  [ 5.79379778e-03  4.06516995e-03  6.51185168e-03 ... -2.71552638e-03
   -1.00101484e-03  1.57996127e-03]
  [ 2.42702663e-04  9.68904234e-04  5.15099522e-03 ...  3.08281416e-03
   -1.36368326e-03  8.99227336e-03]
  ...
  [-1.027358

In [16]:
print(example_batch_predictions[0])

tf.Tensor(
[[-0.00267272  0.00163643 -0.00068724 ... -0.00164137  0.00156672
  -0.00041583]
 [-0.00070783  0.00996764 -0.00159644 ... -0.00073396  0.00203339
   0.00111387]
 [-0.00200914  0.00706744  0.00534977 ... -0.00278299  0.005054
   0.00291223]
 ...
 [ 0.00024733  0.01144649 -0.00103832 ... -0.00267823 -0.00956432
   0.00231411]
 [-0.00047049  0.01596141 -0.00177025 ... -0.0014662  -0.00791234
   0.0028974 ]
 [-0.00233091  0.01826997 -0.00060462 ... -0.00446213 -0.01571481
   0.00155771]], shape=(100, 65), dtype=float32)


In [17]:
sample_indices = tf.random.categorical(
    logits=example_batch_predictions[0], num_samples=1)
print(sample_indices)
print('-' * 50)
# (100, 65) -> (100, 1)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)  # (100, )

tf.Tensor(
[[47]
 [25]
 [ 8]
 [21]
 [ 3]
 [43]
 [62]
 [24]
 [52]
 [15]
 [30]
 [59]
 [54]
 [55]
 [50]
 [25]
 [56]
 [18]
 [ 5]
 [31]
 [11]
 [64]
 [24]
 [20]
 [25]
 [31]
 [10]
 [59]
 [30]
 [53]
 [21]
 [31]
 [19]
 [10]
 [44]
 [21]
 [64]
 [52]
 [32]
 [51]
 [30]
 [49]
 [17]
 [ 7]
 [ 5]
 [49]
 [54]
 [ 8]
 [41]
 [ 7]
 [63]
 [31]
 [ 9]
 [48]
 [ 9]
 [28]
 [20]
 [36]
 [31]
 [16]
 [10]
 [17]
 [58]
 [31]
 [49]
 [ 2]
 [52]
 [37]
 [38]
 [32]
 [49]
 [28]
 [59]
 [ 4]
 [32]
 [23]
 [41]
 [31]
 [26]
 [22]
 [21]
 [29]
 [53]
 [11]
 [23]
 [55]
 [53]
 [30]
 [33]
 [14]
 [52]
 [39]
 [44]
 [62]
 [43]
 [25]
 [51]
 [53]
 [31]
 [56]], shape=(100, 1), dtype=int64)
--------------------------------------------------
tf.Tensor(
[47 25  8 21  3 43 62 24 52 15 30 59 54 55 50 25 56 18  5 31 11 64 24 20
 25 31 10 59 30 53 21 31 19 10 44 21 64 52 32 51 30 49 17  7  5 49 54  8
 41  7 63 31  9 48  9 28 20 36 31 16 10 17 58 31 49  2 52 37 38 32 49 28
 59  4 32 23 41 31 26 22 21 29 53 11 23 55 53 30 33 14 52 39 44 62 43 25
 51 

In [18]:
print('Input:', repr(''.join(index2char[input_example_batch[0]])))
print('-' * 50)
print('Output:', repr(''.join(index2char[target_example_batch[0]])))
print('-' * 50)
print('Predictions:', repr(''.join(index2char[sample_indices])))

Input: ": this must be patch'd\nWith cloth of any colour.\n\nCOMINIUS:\nNay, come away.\n\nA Patrician:\nThis man h"
--------------------------------------------------
Output: " this must be patch'd\nWith cloth of any colour.\n\nCOMINIUS:\nNay, come away.\n\nA Patrician:\nThis man ha"
--------------------------------------------------
Predictions: "iM.I$exLnCRupqlMrF'S;zLHMS:uRoISG:fIznTmRkE-'kp.c-yS3j3PHXSD:EtSk!nYZTkPu&TKcSNJIQo;KqoRUBnafxeMmoSr"


In [19]:
# from_logits是否预期为对数张量。默认情况下，我们假设对概率分布进行编码
# from_logits=False就表示把已经概率化了的输出，重新映射回原值。log（p/(1-p)）
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)


model.compile(optimizer='adam', loss=loss)
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
4.173581


In [20]:
# 保存模型
output_dir = './text_generation_lstm_checkpoints'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                                      # 只保存权重的值
                                                      save_weights_only=True)
epochs = 100
history = model.fit(sequence_dataset, epochs=epochs, callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [21]:
print(tf.train.latest_checkpoint(output_dir))

./text_generation_lstm_checkpoints/ckpt_100


In [22]:
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model2.load_weights(tf.train.latest_checkpoint(output_dir))
model2.build(tf.TensorShape([1,None]))
print(model2.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
def generate_text(model, start_string, num_generate=1000):
    input_eval = [char2index[c] for c in start_string]
    print(input_eval)
    input_eval = tf.expand_dims(input_eval, 0)
    print(input_eval)
    text_generated = []
    # 对model进行reset，连续调用的时候使用resets_states()
    model.reset_states()
    # temperature > 1, 概率平缓 更加random  可以设置为2
    # temperature < 1, 概率更陡峭 更加greedy 可以设为0.5
    temperature = 2
    for _ in range(num_generate):
        predictions = model(input_eval)
        # predictions: logits -> softmax -> probability
        predictions = predictions / temperature
        # print(predictions.shape)
        predictions = tf.squeeze(predictions, 0)
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        # 得到预测id后，放入text_generated
        text_generated.append(index2char[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

In [24]:
new_text = generate_text(model2, 'All: ')
print(new_text)




[13, 50, 50, 10, 1]
tf.Tensor([[13 50 50 10  1]], shape=(1, 5), dtype=int32)
All: Give ck make up
A leg Romeo be cui,
Who sits confose me intoly, and your blood,
Or so dise, Sound all overhers' most king.

RIVERS:
To you shou.

POMPetty:
He noblen;
closescut the taming star; ybsenging it. Harwhbe bale-vex'd,
Then must I could prote your worship, ust rather sooted edward'sceing, know.
Your lirert, wes'tes
Even
As grant of train.

VINCENTIO:
Come, old Oxporm,
To rewre minutt out of holy degree,
By giving counsel and the king: whereow--bedg'd RItond for ever!

BLENS run;
Know, gladly know his life.
A mansi.
But where be d,
ten to only rds,
And divod we ot o!
You eck tongue
Than in proof upon Dercyou chal;
Nayiding hither bysome aughter here, take her i' the measurehat! 'Twas thy fault,
Rand on prebath sid?

Servant:
This is the lady's zeal, Kathappelo's winds; and thanks-risety.

DUKE VINCENTIO:
Not, T:
Th some: let us have hably not
changing god! no LADY ANNE:
Carry Lord!

PROSPERO:
Come