<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/lstm/exp3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

使用 LSTM 训练一个基于词的语言模型，并用它生成文本。

导入需要的包

In [28]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# 1 加载数据

In [8]:
# Training corpus: the four lines of the nursery rhyme. Every rhyme line is
# followed by two newline characters, so splitting on '\n' also yields
# blank (or whitespace-only) entries between the lines.
data = (
  " Jack and Jill went up the hill\n\n"
  "To fetch a pail of water\n\n"
  "Jack fell down and broke his crown\n\n"
  "And Jill came tumbling after\n "
)

## 使用keras的Tokenizer将文本转换为整数序列。

Tokenizer的原理是：按照词频对训练文本中的词降序排序，然后使用序号对词进行编码。例如：排名第一的词对应的整数值是1，排名第二的词对应的整数值是2。序号从1开始，0被保留（不对应任何词），后面用作填充值。

fit_on_texts后有两个有用的输出：  
word_counts：词频统计结果  
word_index：词和index的对应关系，也就是词和整数的对应的关系  


In [3]:
# Create a Tokenizer and fit it on the source text, passed as a single
# document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Print, one per line: the per-word counts, the per-word document counts,
# the word -> integer index mapping, and the number of documents seen.
print(
  tokenizer.word_counts,
  tokenizer.word_docs,
  tokenizer.word_index,
  tokenizer.document_count,
  sep='\n',
)

OrderedDict([('jack', 2), ('and', 3), ('jill', 2), ('went', 1), ('up', 1), ('the', 1), ('hill', 1), ('to', 1), ('fetch', 1), ('a', 1), ('pail', 1), ('of', 1), ('water', 1), ('fell', 1), ('down', 1), ('broke', 1), ('his', 1), ('crown', 1), ('came', 1), ('tumbling', 1), ('after', 1)])
defaultdict(<class 'int'>, {'to': 1, 'jill': 1, 'came': 1, 'water': 1, 'broke': 1, 'his': 1, 'of': 1, 'fell': 1, 'after': 1, 'jack': 1, 'tumbling': 1, 'went': 1, 'the': 1, 'down': 1, 'fetch': 1, 'hill': 1, 'a': 1, 'and': 1, 'crown': 1, 'pail': 1, 'up': 1})
{'and': 1, 'jack': 2, 'jill': 3, 'went': 4, 'up': 5, 'the': 6, 'hill': 7, 'to': 8, 'fetch': 9, 'a': 10, 'pail': 11, 'of': 12, 'water': 13, 'fell': 14, 'down': 15, 'broke': 16, 'his': 17, 'crown': 18, 'came': 19, 'tumbling': 20, 'after': 21}
1


## 确定词汇量

In [4]:
# Vocabulary size is the number of distinct words plus one: the tokenizer's
# word indices start at 1, so one extra slot is needed for index 0.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % (vocab_size,))

Vocabulary Size: 22


## 基于行创建序列

In [17]:
# Build the training sequences line by line: every prefix of a line that
# holds at least two tokens becomes one sequence.
sequences = []
for line in data.split('\n'):
  # texts_to_sequences takes a list of texts and returns a list of integer
  # lists, e.g. [[2, 1, 3, 4, 5, 6, 7]]; keep the single entry for this line.
  encoded = tokenizer.texts_to_sequences([line])[0]
  print(encoded)
  # Lines with fewer than two tokens (including blank ones) contribute nothing.
  sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
print('Total Sequences: %d' % len(sequences))
print(sequences[:10])

[2, 1, 3, 4, 5, 6, 7]
[]
[8, 9, 10, 11, 12, 13]
[]
[2, 14, 15, 1, 16, 17, 18]
[]
[1, 3, 19, 20, 21]
[]
Total Sequences: 21
[[2, 1], [2, 1, 3], [2, 1, 3, 4], [2, 1, 3, 4, 5], [2, 1, 3, 4, 5, 6], [2, 1, 3, 4, 5, 6, 7], [8, 9], [8, 9, 10], [8, 9, 10, 11], [8, 9, 10, 11, 12]]


## 填充序列

In [18]:
# Pad every sequence on the left with zeros up to the longest sequence length.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
print(sequences[:10])

Max Sequence Length: 7
[[ 0  0  0  0  0  2  1]
 [ 0  0  0  0  2  1  3]
 [ 0  0  0  2  1  3  4]
 [ 0  0  2  1  3  4  5]
 [ 0  2  1  3  4  5  6]
 [ 2  1  3  4  5  6  7]
 [ 0  0  0  0  0  8  9]
 [ 0  0  0  0  8  9 10]
 [ 0  0  0  8  9 10 11]
 [ 0  0  8  9 10 11 12]]


# 创建输入输出

In [19]:
# Split each padded sequence into the model input (all but the last token)
# and the target (the last token).
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
print(X[:5])
print(y[:5])
# One-hot encode the target word ids.
y = to_categorical(y, num_classes=vocab_size)

[[0 0 0 0 0 2]
 [0 0 0 0 2 1]
 [0 0 0 2 1 3]
 [0 0 2 1 3 4]
 [0 2 1 3 4 5]]
[1 3 4 5 6]


# 定义模型

Embedding：  
输入shape：形如（samples，sequence_length）的2D张量  
输出shape：形如(samples, sequence_length, output_dim)的3D张量  
嵌入层将正整数（下标）转换为具有固定大小的向量，如[[4],[20]]->[[0.25,0.1],[0.6,-0.2]]。[Embedding详解](https://blog.csdn.net/jiangpeng59/article/details/77533309) 



LSTM：  
输入shape：(samples, time_steps, input_dim)  
输出shape：(samples, output_dim)（默认 return_sequences=False，只返回最后一个时间步的输出）

*samples表示样本数量*


In [21]:
def define_model(vocab_size, max_length, embedding_dim=10, lstm_units=50):
  """Build and compile the next-word prediction network.

  The network is Embedding -> LSTM -> softmax Dense, compiled with
  categorical cross-entropy loss, the Adam optimizer and an accuracy metric.
  A summary of the model is printed before it is returned.

  Args:
    vocab_size: Number of integer word ids; used as the embedding's input
      dimension and as the width of the softmax output.
    max_length: Length of the padded sequences including the target word;
      the embedding is therefore given max_length - 1 input positions.
    embedding_dim: Size of each word vector (default 10).
    lstm_units: Number of units in the LSTM layer (default 50).

  Returns:
    The compiled Sequential model.
  """
  model = Sequential()
  # The last token of every padded sequence is the target, so the input
  # window is one position shorter than max_length.
  model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                      input_length=max_length - 1))
  model.add(LSTM(lstm_units))
  # One output score per word id.
  model.add(Dense(vocab_size, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
  model.summary()
  return model

In [22]:
# Instantiate the network for this vocabulary and padded sequence length.
model = define_model(vocab_size=vocab_size, max_length=max_length)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 10)             220       
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


# 训练模型

In [None]:
# Train on the (input, one-hot target) pairs for 500 epochs, logging one
# line per epoch.
model.fit(x=X, y=y, epochs=500, verbose=2)

# 评估模型

In [40]:
# Generate a sequence from a language model: starting from the seed text,
# repeatedly predict the next word and append it.
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
  """Greedily extend ``seed_text`` with up to ``n_words`` predicted words.

  Args:
    model: Trained model; ``model.predict`` is called with a batch holding
      one padded sequence and must return one row of per-word-id scores.
    tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
      ``word_index`` (word -> integer id) mapping.
    max_length: Length the encoded text is padded or truncated to before
      prediction, i.e. the model's input length.
    seed_text: Text to start generating from.
    n_words: Maximum number of words to append.

  Returns:
    The seed text followed by the generated words, separated by single
    spaces. Generation stops early if the predicted id has no word.
  """
  # Build the id -> word lookup once instead of scanning word_index for
  # every generated word.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  in_text = seed_text
  for _ in range(n_words):
    # Encode the text generated so far as integer ids.
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # Pre-pad to the fixed input length expected by the model.
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    # Pick the highest-scoring word id for the single sequence in the batch;
    # verbose=0 keeps predict from printing progress output on every step.
    scores = model.predict(encoded, verbose=0)[0]
    yhat = int(np.argmax(scores))
    out_word = index_to_word.get(yhat)
    if not out_word:
      # No word for this id (e.g. the padding id 0). Appending an empty word
      # would leave the encoded input unchanged and only add trailing spaces,
      # so stop here.
      break
    in_text += ' ' + out_word
  return in_text

In [41]:
# Generate four words after each seed word; the model input is one position
# shorter than the padded training sequences.
for seed_word in ('Jack', 'Jill'):
  print(generate_seq(model, tokenizer, max_length - 1, seed_word, 4))

Jack fell down and broke
Jill jill came tumbling after
