In [1]:
"""
created on 11 Mar 2019

@author: Huan Zheng
"""
# 使用LSTM生成文本

'\ncreated on 11 Mar 2019\n\n@author: Huan Zheng\n'

# 生成式循环网络简史
# 如何生成序列数据
# 采样策略的重要性
## 对于不同的softmax温度，对概率分布进行重新加权

In [2]:
import numpy as np

def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    return distribution / np.sum(distribution)

# 实现字符级的LSTM文本生成
## 下载并解析初始文本文件

In [1]:
import keras
import numpy as np

path = keras.utils.get_file('nietzsche', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print('Corpus length:', len(text))

Using TensorFlow backend.


Corpus length: 106496


## 将字符序列向量化

In [4]:
maxlen = 60 # 提取60个字符组成的序列
step = 3 # 每3个字符采样一个新序列
sentences = [] # 保存所提取的序列
next_chars = [] # 保存目标（即下一个字符）

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
    
print('Number of sequences:', len(sentences))

chars = sorted(list(set(text))) # 语料中唯一字符组成的集合
print('Unique characters:', len(chars))
char_indices = dict((char, chars.index(char)) for char in chars) # 一个字典，将唯一字符映射为它在列表chars中的索引

print('Vectorization...')
# one-hot编码
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 35479
Unique characters: 52
Vectorization...


## 用于预测下一个字符的单层LSTM模型

In [5]:
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

## 模型编译配置

In [6]:
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## 给定模型预测、采样下一个字符的函数

In [8]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## 文本生成循环

In [11]:
import random
import sys

for epoch in range(1, 20):
    print('epoch', epoch)
    model.fit(x, y, batch_size=256, epochs=1) # 将模型在数据上拟合一次
    start_index = random.randint(0, len(text) - maxlen - 1) # 随机选取一个文本种子
    generated_text = text[start_index: start_index + maxlen]
    print('---Generating with seed: "' + generated_text +'"')

for temperature in [0.2, 0.5, 1.0, 1.2]:
    print('--- temperature:', temperature)
    sys.stdout.write(generated_text)
    
    for i in range(400):
        # 对目前生成的字符进行one-hot编码
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1

        preds = model.predict(sampled, verbose=0)[0]
        next_idnex = sample(preds, temperature)
        next_char = chars[next_idnex]

        generated_text += next_char
        generated_text = generated_text[1:]

        sys.stdout.write(next_char)

epoch 1
Epoch 1/1
---Generating with seed: " really ought to free ourselves
from the misleading signific"
epoch 2
Epoch 1/1
---Generating with seed: " anywhere
previously; with such a tensely strained bow one c"
epoch 3
Epoch 1/1
---Generating with seed: "
herald-calls which summon the bravest to their bravery. boo"
epoch 4
Epoch 1/1
---Generating with seed: "ng be
only superficial valuations, special kinds of _niaiser"
epoch 5
Epoch 1/1
---Generating with seed: "ds
branches off and develops itself in organic processes (na"
epoch 6
Epoch 1/1
---Generating with seed: "lly only operate on "will"--and not on "matter" (not
on "ner"
epoch 7
Epoch 1/1
---Generating with seed: "trust--it is possible that the older psychologists had a
mer"
epoch 8
Epoch 1/1
---Generating with seed: " amounted to
the very inversion of truth, and the denial of "
epoch 9
Epoch 1/1
---Generating with seed: "injurious, obstructive,
blinding, and distorting manner. a p"
epoch 10
Epoch 1/1
---Generating with seed: "f

  This is separate from the ipykernel package so we can avoid doing imports until


ct-hims of will to deser is the clas forcestial doiblicts of the mang success, that it is that the very any haltic--- temperature: 1.0
cts of the mang success, that it is that the very any halticl was fort artunt of the however and the gover at in effect, without instincts of should beside and and bodulvess suitabition was for a stinting in a worth. ariato de"pialess in nament call; we have one's skeptofing of histing difficati and arturlfreen
and man--thes findless
and profound
surming in not dued the most see had the manguage,
fen the world, without a wable still for a distring man. of --- temperature: 1.2
fen the world, without a wable still for a distring man. of "beyog" to perhaps briuans of chisition, i may be views tastewwh--as without day the breall ypomes, in all the to ducienness and ressem
ence, not
on fries and vilues that nothing wien pencaue
to reard alon thinking relare he would so ngiaal part! and no enversares a discology
as it
is
necriation of of its oped of the is sa